-
Notifications
You must be signed in to change notification settings - Fork 0
/
trial3.cpp
129 lines (110 loc) · 3.65 KB
/
trial3.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
//time
#include<chrono>
#include <stdio.h>
#include<algorithm>
#include<fstream>
#include<iostream>
#include<string>
#include<unordered_set>
#include<vector>
using namespace std;
//time
using namespace std::chrono;
/*
Global word sets shared by insertion() and converttostring()
*/
// Words that converttostring() read immediately after a keyword. Such words are
// recorded here and left out of its output. The set is never cleared, so its
// entries persist from one converttostring() call to the next.
unordered_set<string> mp;
// Keywords registered by insertion(): "int", "float", "void" and "return".
unordered_set<string> keywords;
// Registers the recognised keywords in the global `keywords` set.
// Call once before converttostring(), which looks words up in that set.
void insertion(){
    static const char* const kKeywordList[] = {"int", "float", "void", "return"};
    for (const char* word : kKeywordList) {
        keywords.insert(word);
    }
}
//FILE TO STRING AND FILTER
/**
 * Reads a text file and returns its contents as one normalised string.
 *
 * Normalisation rules:
 *  - spaces, newlines and tabs are dropped;
 *  - a "word" is a run of ASCII letters/digits that starts with a letter;
 *  - the word that follows a keyword (see the global `keywords`) is recorded
 *    in the global `mp` and is NOT copied to the result;
 *  - any other word is copied to the result unless it is already in `mp`;
 *  - every other character is copied unchanged.
 *
 * `mp` is a global that is never cleared, so words recorded while reading one
 * file are also dropped from files read by later calls.
 *
 * NOTE(review): "return" is registered as a keyword, so the word after a
 * `return` is treated like a declared name — confirm this is intended.
 *
 * @param filename path of the file to read.
 * @return the normalised text; an empty string (after printing
 *         "invalid filename") when the file cannot be opened.
 */
std::string converttostring(std::string filename = "/Users/sayash/Desktop/Similarity/file1.txt") //default local file
{
    std::string s;          // normalised text to return
    std::string temp_word;  // word currently being collected

    FILE *in_file = fopen(filename.c_str(), "r");
    if (in_file == nullptr) {
        std::cout<<"invalid filename\n";
        return s;   // nothing was opened, so there is nothing to close
    }

    const auto is_blank = [](int c) {
        return c == ' ' || c == '\n' || c == '\t';
    };
    const auto is_letter = [](int c) {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
    };
    const auto is_word_char = [&is_letter](int c) {
        return is_letter(c) || (c >= '0' && c <= '9');
    };

    bool after_keyword = false;   // true when the previous word was a keyword

    // fgetc returns an int so that EOF can be told apart from every valid
    // byte; keeping the value in a char would break that comparison.
    int ch = fgetc(in_file);
    while (ch != EOF) {
        if (is_letter(ch)) {
            temp_word.clear();
            do {
                temp_word.push_back(static_cast<char>(ch));
                ch = fgetc(in_file);
            } while (is_word_char(ch));   // EOF is not a word character

            if (after_keyword) {
                // Word that follows a keyword: remember it and leave it out.
                mp.insert(temp_word);
                after_keyword = false;
            } else {
                if (mp.find(temp_word) == mp.end()) s += temp_word;
                if (keywords.find(temp_word) != keywords.end()) after_keyword = true;
            }
            // `ch` already holds the character after the word (or EOF); let
            // the next iteration handle it so it is emitted AFTER the word.
            continue;
        }

        if (!is_blank(ch)) s.push_back(static_cast<char>(ch));
        ch = fgetc(in_file);
    }

    fclose(in_file); // only reached when the file was opened successfully
    return s;
}
//LEVENSHTEIN
/**
 * Computes the Levenshtein (edit) distance between two strings: the minimum
 * number of single-character insertions, deletions and substitutions needed
 * to turn `a` into `b`.
 *
 * Only two rows of the dynamic-programming table are kept, in heap-allocated
 * vectors, so memory use is proportional to the shorter string instead of
 * requiring a (len_a+1) x (len_b+1) array on the stack.
 *
 * @param a first string.
 * @param b second string.
 * @return the edit distance; equals the other string's length when one
 *         argument is empty.
 */
int levenshtein(const std::string& a, const std::string& b){
    // The distance is symmetric, so walk the rows over the longer string and
    // size the row buffers by the shorter one.
    const bool a_is_longer = a.size() >= b.size();
    const std::string& outer = a_is_longer ? a : b;
    const std::string& inner = a_is_longer ? b : a;

    const std::size_t width = inner.size();
    std::vector<int> prev(width + 1);   // row i-1 of the table
    std::vector<int> curr(width + 1);   // row i of the table

    // Row 0: turning an empty prefix into inner[0..j) takes j insertions.
    for (std::size_t j = 0; j <= width; ++j) {
        prev[j] = static_cast<int>(j);
    }

    for (std::size_t i = 1; i <= outer.size(); ++i) {
        curr[0] = static_cast<int>(i);  // i deletions to reach an empty prefix
        for (std::size_t j = 1; j <= width; ++j) {
            if (outer[i - 1] == inner[j - 1]) {
                curr[j] = prev[j - 1];
            } else {
                curr[j] = 1 + std::min(std::min(curr[j - 1], prev[j]), prev[j - 1]);
            }
        }
        prev.swap(curr);                // the finished row becomes "previous"
    }

    return prev[width];
}
//MAIN
/**
 * Entry point: normalises two files, prints both normalised strings, prints
 * their Levenshtein distance, then prints the elapsed time in microseconds.
 *
 * Usage: program [first_file [second_file]]
 * When an argument is omitted the original hard-coded path is used, so
 * running without arguments behaves as before.
 */
int main(int argc, char* argv[]){
    insertion();

    const std::string first_path =
        (argc > 1) ? argv[1] : "/Users/sayash/Desktop/Similarity/file1.txt";
    const std::string second_path =
        (argc > 2) ? argv[2] : "/Users/sayash/Desktop/Similarity/file3.txt";

    // The timed region covers reading, printing and the distance computation.
    const auto start = std::chrono::high_resolution_clock::now();

    const std::string s1 = converttostring(first_path);
    const std::string s2 = converttostring(second_path);
    std::cout << s1 << std::endl;
    std::cout << s2 << std::endl;

    // levenshtein comparison
    std::cout << levenshtein(s1, s2) << std::endl; //final output

    const auto stop = std::chrono::high_resolution_clock::now();
    const auto duration =
        std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
    std::cout << "Time taken: " << duration.count() << " microseconds" << std::endl;
    return 0;
}