Skip to content

Commit 814e4f8

Browse files
committed
Improved Levenshtein distance algorithm.
Changed similarity threshold calculation formula. Fixed clustering algorithm - all pairs are now correctly assigned to a cluster.
1 parent 998aeb8 commit 814e4f8

File tree

6 files changed

+123
-71
lines changed

6 files changed

+123
-71
lines changed

build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ dependencies {
2929
implementation 'org.python:jython-standalone:2.7.2'
3030
/* Pygments version 2.5.2 is the last version that supports Python 2,
3131
* so it's the last version compatible with Jython,
32-
* because the latter doesn't support Python 3 yet */
32+
* because the latter doesn't support Python 3 (yet?) */
3333
implementation 'org.pygments:pygments:2.5.2'
3434
}
3535

src/main/java/ee/ut/similaritydetector/backend/Analyser.java

Lines changed: 25 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ private void compareSolutions(Exercise exercise) {
161161
for (int j = i + 1; j < solutionCount; j++) {
162162
Solution solution2 = solutions.get(j);
163163
double similarity = findSimilarity(solution1, solution2, exercise.getSimilarityThreshold());
164-
if (similarity > exercise.getSimilarityThreshold()) {
164+
if (similarity >= exercise.getSimilarityThreshold()) {
165165
solution1.addSimilarSolution(solution2);
166166
solution2.addSimilarSolution(solution1);
167167
SimilarSolutionPair newSolutionPair;
@@ -207,55 +207,50 @@ private double findSimilarity(Solution sol1, Solution sol2, double similarityThr
207207
* then A, B and C all belong to the same cluster.
208208
*/
209209
private void clusterSimilarPairs() {
210-
for (SimilarSolutionPair pair1 : similarSolutionPairs) {
211-
Solution sol1 = pair1.getFirstSolution();
212-
Solution sol2 = pair1.getSecondSolution();
213-
SimilarSolutionCluster cluster = null;
210+
for (SimilarSolutionPair pair : similarSolutionPairs) {
211+
Solution sol1 = pair.getFirstSolution();
212+
Solution sol2 = pair.getSecondSolution();
213+
SimilarSolutionCluster cluster;
214214
// If neither solution is in an existing cluster
215215
if (similarSolutionClusters.stream().noneMatch(c -> c.containsSolution(sol1)) &&
216-
similarSolutionClusters.stream().noneMatch(c -> c.containsSolution(sol2))) {
216+
similarSolutionClusters.stream().noneMatch(c -> c.containsSolution(sol2))) {
217217
cluster = new SimilarSolutionCluster(sol1.getExerciseName(), sol1, sol2);
218+
cluster.addSolutionPair(pair);
219+
similarSolutionClusters.add(cluster);
218220
}
219221
// If only first solution is in an existing cluster
220-
else if (similarSolutionClusters.stream().anyMatch(c -> c.containsSolution(sol1)) && similarSolutionClusters.stream().noneMatch(c -> c.containsSolution(sol2))) {
222+
else if (similarSolutionClusters.stream().anyMatch(c -> c.containsSolution(sol1)) &&
223+
similarSolutionClusters.stream().noneMatch(c -> c.containsSolution(sol2))) {
221224
SimilarSolutionCluster existingCluster = similarSolutionClusters.stream().filter(x -> x.getSolutions().contains(sol1)).findAny().get();
222225
existingCluster.addSolution(sol2);
223-
existingCluster.addSolutionPair(pair1);
226+
existingCluster.addSolutionPair(pair);
224227
}
225228
// If only second solution is in an existing cluster
226-
else if (similarSolutionClusters.stream().noneMatch(c -> c.containsSolution(sol1)) && similarSolutionClusters.stream().anyMatch(c -> c.containsSolution(sol2))) {
229+
else if (similarSolutionClusters.stream().noneMatch(c -> c.containsSolution(sol1)) &&
230+
similarSolutionClusters.stream().anyMatch(c -> c.containsSolution(sol2))) {
227231
SimilarSolutionCluster existingCluster = similarSolutionClusters.stream().filter(x -> x.getSolutions().contains(sol2)).findAny().get();
228232
existingCluster.addSolution(sol1);
229-
existingCluster.addSolutionPair(pair1);
233+
existingCluster.addSolutionPair(pair);
230234
}
231235
// If both are in an existing cluster
232-
else if (similarSolutionClusters.stream().anyMatch(c -> c.containsSolution(sol1)) && similarSolutionClusters.stream().anyMatch(c -> c.containsSolution(sol2))) {
236+
else if (similarSolutionClusters.stream().anyMatch(c -> c.containsSolution(sol1)) &&
237+
similarSolutionClusters.stream().anyMatch(c -> c.containsSolution(sol2))) {
233238
SimilarSolutionCluster existingCluster1 = similarSolutionClusters.stream().filter(x -> x.getSolutions().contains(sol1)).findAny().get();
234239
SimilarSolutionCluster existingCluster2 = similarSolutionClusters.stream().filter(x -> x.getSolutions().contains(sol2)).findAny().get();
240+
// If both are in the same cluster
235241
if (existingCluster1 == existingCluster2) {
236-
existingCluster1.addSolutionPair(pair1);
237-
} else {
238-
existingCluster1.addSolution(sol2);
239-
existingCluster1.addSolutionPair(pair1);
240-
existingCluster2.addSolution(sol1);
241-
existingCluster2.addSolutionPair(pair1);
242+
existingCluster1.addSolutionPair(pair);
242243
}
243-
}
244-
if (cluster != null) {
245-
cluster.addSolutionPair(pair1);
246-
for (SimilarSolutionPair pair2 : similarSolutionPairs) {
247-
if (pair1 == pair2) continue;
248-
if (cluster.containsSolution(pair2.getFirstSolution())) {
249-
cluster.addSolution(pair2.getSecondSolution());
250-
cluster.addSolutionPair(pair2);
251-
} else if (cluster.containsSolution(pair2.getSecondSolution())) {
252-
cluster.addSolution(pair2.getFirstSolution());
253-
cluster.addSolutionPair(pair2);
254-
}
244+
// If they are in different clusters, then we merge the two clusters
245+
else {
246+
existingCluster1.addSolutionPair(pair);
247+
existingCluster2.getSolutions().forEach(existingCluster1::addSolution);
248+
existingCluster2.getSolutionPairs().forEach(existingCluster1::addSolutionPair);
249+
similarSolutionClusters.remove(existingCluster2);
255250
}
256-
similarSolutionClusters.add(cluster);
257251
}
258252
}
253+
// Create cluster names
259254
for (SimilarSolutionCluster cluster : similarSolutionClusters) {
260255
cluster.createName();
261256
}

src/main/java/ee/ut/similaritydetector/backend/Exercise.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ public void calculateSimilarityThreshold() {
8282
if (averageSolutionLength > 100)
8383
lengthMultiplier = averageSolutionLength / 100;
8484

85-
similarityThreshold = Math.pow(0.985, lengthMultiplier);
85+
similarityThreshold = Math.pow(0.98, lengthMultiplier); //Math.exp(-0.0175 * lengthMultiplier);
8686
System.out.println(name + " - Used similarity threshold: " +similarityThreshold);
8787
}
8888

src/main/java/ee/ut/similaritydetector/backend/LevenshteinDistance.java

Lines changed: 91 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
package ee.ut.similaritydetector.backend;
22

3-
import java.util.Arrays;
4-
53
public class LevenshteinDistance {
64

75
/**
86
* <p>Adapted from: https://github.com/tdebatty/java-string-similarity/blob/master/src/main/java/info/debatty/java/stringsimilarity/Levenshtein.java [06.03.2021]</p>
97
* <p>Calculates the customised Levenshtein distance metric of two strings.</p>
8+
* <p>As an optimisation, the algorithm halts early if the distance metric surpasses the given threshold.</p>
109
* <p>Customisations made:
1110
* <ol>
1211
* <li>Same characters with case difference have an edit cost of 0.2 (instead of 1).</li>
@@ -22,29 +21,58 @@ public class LevenshteinDistance {
2221
* <li>ž <-> z</li>
2322
* </ul>
2423
* </li>
25-
* </ol></p>
26-
* <p>There are two optimizations made:
27-
* <ol>
28-
* <li>The strings are presented as char arrays for improved efficiency.</li>
29-
* <li>If the distance metric surpasses the given threshold, the algorithm is halted.</li>
30-
* </ol></p>
31-
*
32-
* @param s1 char array of string 1
33-
* @param s2 char array of string 2
24+
* </ol>
25+
* </p>
26+
* @param s1 String 1
27+
* @param s2 String 2
3428
* @param threshold the max difference threshold
3529
* @return the Levenshtein distance between s1 and s2
3630
*/
37-
public static double distance(char[] s1, char[] s2, int threshold) {
38-
if (s1 == null || s2 == null) {
31+
public static double distance(String s1, String s2, int threshold) {
32+
if (s1 == null || s2 == null || s1.length() == 0 || s2.length() == 0) {
3933
throw new NullPointerException("Strings cannot be null");
4034
}
4135

42-
if (Arrays.equals(s1, s2)) {
36+
if (s1.equals(s2)) {
37+
return 0;
38+
}
39+
40+
// Find the common prefix and suffix of the two strings
41+
// and remove them from the strings in order to make the algorithm faster
42+
StringBuilder sb1 = new StringBuilder(s1);
43+
StringBuilder sb2 = new StringBuilder(s2);
44+
int prefixMismatchIndex;
45+
if (sb1.length() <= sb2.length()) {
46+
prefixMismatchIndex = findFirstMismatchIndex(sb1, sb2);
47+
} else {
48+
prefixMismatchIndex = findFirstMismatchIndex(sb2, sb1);
49+
}
50+
51+
// If strings are equal, return distance 0
52+
if (prefixMismatchIndex == -1) {
4353
return 0;
4454
}
55+
sb1 = new StringBuilder(sb1.substring(prefixMismatchIndex));
56+
sb2 = new StringBuilder(sb2.substring(prefixMismatchIndex));
57+
58+
// For finding suffix mismatch the strings are reversed
59+
sb1.reverse();
60+
sb2.reverse();
61+
62+
int suffixMismatchIndex;
63+
if (sb1.length() <= sb2.length()) {
64+
suffixMismatchIndex = findFirstMismatchIndex(sb1, sb2);
65+
} else {
66+
suffixMismatchIndex = findFirstMismatchIndex(sb2, sb1);
67+
}
68+
69+
sb1 = new StringBuilder(sb1.substring(suffixMismatchIndex));
70+
sb2 = new StringBuilder(sb2.substring(suffixMismatchIndex));
71+
s1 = sb1.reverse().toString();
72+
s2 = sb2.reverse().toString();
4573

46-
int n = s1.length;
47-
int m = s2.length;
74+
int n = s1.length();
75+
int m = s2.length();
4876

4977
double[] prevCosts = new double[m + 1];
5078
double[] currCosts = new double[m + 1];
@@ -67,12 +95,13 @@ public static double distance(char[] s1, char[] s2, int threshold) {
6795

6896
// Find the costs for the current row to get the minimum edit cost
6997
for (int j = 0; j < m; j++) {
70-
currCosts[j + 1] = findCost(s1, s2, prevCosts, currCosts, i, j);
98+
currCosts[j + 1] = findCost(s1.charAt(i), s2.charAt(j), prevCosts, currCosts, j);
7199
minCost = Math.min(minCost, currCosts[j + 1]);
72100
}
73101

74-
// If the distance threshold is passed
75-
if (minCost >= threshold) {
102+
// If the distance threshold is surpassed,
103+
// then consider the two strings not similar enough
104+
if (minCost > threshold) {
76105
return -1;
77106
}
78107

@@ -84,40 +113,57 @@ public static double distance(char[] s1, char[] s2, int threshold) {
84113
return prevCosts[m];
85114
}
86115

116+
private static int findFirstMismatchIndex(StringBuilder s1, StringBuilder s2) {
117+
int mismatchIndex = 0;
118+
for (int i = 0, n = s1.length(); i < n; i++) {
119+
if (s1.charAt(i) != s2.charAt(i)) {
120+
return mismatchIndex;
121+
}
122+
mismatchIndex ++;
123+
}
124+
if (s1.length() > mismatchIndex || s2.length() > mismatchIndex) {
125+
return mismatchIndex;
126+
}
127+
// Strings are equal
128+
else {
129+
return -1;
130+
}
131+
}
132+
133+
87134
/**
88135
* Finds the minimal edit cost for the current characters
89136
*
90-
* @param s1 char array of string 1
91-
* @param s2 char array of string 2
137+
* @param c1 character from String 1 that is compared
138+
* @param c2 character from String 2 that is compared
92139
* @param prevCosts previous row of costs
93140
* @param currCosts current row of costs
94-
* @param i current character index of string 1
95-
* @param j current character index of string 2
141+
* @param j the index of character c2 in String 2
96142
* @return the cost for the edit
97143
*/
98-
public static double findCost(char[] s1, char[] s2, double[] prevCosts, double[] currCosts, int i, int j) {
144+
public static double findCost(char c1, char c2, double[] prevCosts, double[] currCosts, int j) {
99145
// Initial substitution cost is 1
100146
double subCost = 1;
101147
// Same characters
102-
if (s1[i] == s2[j]) {
148+
if (c1 == c2) {
103149
subCost = 0;
104150
}
105151
// Char case difference
106-
else if (Character.toLowerCase(s1[i]) == s2[j] && Character.toLowerCase(s2[j]) != s2[j] ||
107-
Character.toLowerCase(s2[j]) == s1[i] && Character.toLowerCase(s1[i]) != s1[i]) {
152+
else if (Character.toLowerCase(c1) == c2 && Character.toLowerCase(c1) != c1 ||
153+
Character.toLowerCase(c2) == c1 && Character.toLowerCase(c2) != c2) {
108154
subCost = 0.2;
109155
}
110156
// Char substitution cost special cases
111-
else if (charSubstitutionCondition(s1[i], s2[j], '\"', '\'') ||
112-
charSubstitutionCondition(s1[i], s2[j], 'o', 'õ') ||
113-
charSubstitutionCondition(s1[i], s2[j], 'o', 'ö') ||
114-
charSubstitutionCondition(s1[i], s2[j], 'a', 'ä') ||
115-
charSubstitutionCondition(s1[i], s2[j], 'u', 'ü') ||
116-
charSubstitutionCondition(s1[i], s2[j], 'y', 'ü') ||
117-
charSubstitutionCondition(s1[i], s2[j], '2', 'ä') ||
118-
charSubstitutionCondition(s1[i], s2[j], '6', 'õ') ||
119-
charSubstitutionCondition(s1[i], s2[j], 's', 'š') ||
120-
charSubstitutionCondition(s1[i], s2[j], 'z', 'ž')) {
157+
else if (charSubstitutionCondition(c1, c2, '\"', '\'') ||
158+
charSubstitutionCondition(c1, c2, 'o', 'õ') ||
159+
charSubstitutionCondition(c1, c2, 'o', 'ö') ||
160+
charSubstitutionCondition(c1, c2, 'a', 'ä') ||
161+
charSubstitutionCondition(c1, c2, 'u', 'ü') ||
162+
charSubstitutionCondition(c1, c2, 'y', 'ü') ||
163+
charSubstitutionCondition(c1, c2, '2', 'ä') ||
164+
charSubstitutionCondition(c1, c2, '6', 'õ') ||
165+
charSubstitutionCondition(c1, c2, 's', 'š') ||
166+
charSubstitutionCondition(c1, c2, 'z', 'ž')) {
121167
subCost = 0.2;
122168
}
123169
return Math.min(
@@ -143,8 +189,15 @@ private static boolean charSubstitutionCondition(char curr1, char curr2, char su
143189
public static double normalisedLevenshteinSimilarity(String s1, String s2, float similarityThreshold) {
144190
int maxLength = Math.max(s1.length(), s2.length());
145191
int distanceThreshold = Math.round((1 - similarityThreshold) * maxLength);
146-
double levenshteinDistance = distance(s1.toCharArray(), s2.toCharArray(), distanceThreshold);
192+
double levenshteinDistance;
193+
try {
194+
levenshteinDistance = distance(s1, s2, distanceThreshold);
195+
} catch (NullPointerException e) {
196+
// If a string was null or empty then return similarity 0
197+
return 0;
198+
}
147199
if (levenshteinDistance == -1) {
200+
// If the difference threshold was surpassed, then return similarity 0
148201
return 0;
149202
}
150203
return 1.0 - levenshteinDistance / maxLength;

src/main/java/ee/ut/similaritydetector/backend/Solution.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,4 +147,8 @@ public void generateSyntaxHighlightedHTML() {
147147
sourceCodeHTMLLight = new File(lightHTMLPath);
148148
}
149149

150+
@Override
151+
public String toString() {
152+
return author + " - " + exerciseName;
153+
}
150154
}

src/main/resources/ee/ut/similaritydetector/python/Preprocessor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
""" Function taken from: https://stackoverflow.com/questions/1769332/script-to-remove-python-comments-docstrings [04.03.2021]
1010
Original author: Dan McDougall (https://stackoverflow.com/users/357007/dan-mcdougall)
1111
Modifications made by: Basj (https://stackoverflow.com/users/1422096/basj)
12-
I also made some customisations.
1312
"""
1413
def preprocess_source_code(source_code):
1514
source_code_io = io.BytesIO(source_code)
@@ -44,6 +43,7 @@ def preprocess_source_code(source_code):
4443
return output
4544

4645

46+
# Starting the preprocessing
4747
preprocessed_code_filepath = source_code_filepath[0: len(source_code_filepath) - 3] + "_preprocessed.py"
4848

4949
with open(source_code_filepath, 'rb') as source_code_file:

0 commit comments

Comments
 (0)