Improved Levenshtein distance algorithm.

mikkomaran · mikkomaran · commit 814e4f88286a · 2021-04-28T20:08:02.000+03:00
Changed similarity threshold calculation formula.
Fixed clustering algorithm - all pairs are now correctly assigned to a cluster.
diff --git a/build.gradle b/build.gradle
@@ -29,7 +29,7 @@ dependencies {
     implementation 'org.python:jython-standalone:2.7.2'
     /* Pygments version 2.5.2 is the last version that supports Python 2,
     *  so it's the last version compatible with Jython,
-    *  because the latter doesn't support Python 3 yet */
+    *  because the latter doesn't support Python 3 (yet?) */
     implementation 'org.pygments:pygments:2.5.2'
 }
 
diff --git a/src/main/java/ee/ut/similaritydetector/backend/Analyser.java b/src/main/java/ee/ut/similaritydetector/backend/Analyser.java
@@ -161,7 +161,7 @@ private void compareSolutions(Exercise exercise) {
             for (int j = i + 1; j < solutionCount; j++) {
                 Solution solution2 = solutions.get(j);
                 double similarity = findSimilarity(solution1, solution2, exercise.getSimilarityThreshold());
-                if (similarity > exercise.getSimilarityThreshold()) {
+                if (similarity >= exercise.getSimilarityThreshold()) {
                     solution1.addSimilarSolution(solution2);
                     solution2.addSimilarSolution(solution1);
                     SimilarSolutionPair newSolutionPair;
@@ -207,55 +207,50 @@ private double findSimilarity(Solution sol1, Solution sol2, double similarityThr
      * then A, B and C all belong to the same cluster.
      */
     private void clusterSimilarPairs() {
-        for (SimilarSolutionPair pair1 : similarSolutionPairs) {
-            Solution sol1 = pair1.getFirstSolution();
-            Solution sol2 = pair1.getSecondSolution();
-            SimilarSolutionCluster cluster = null;
+        for (SimilarSolutionPair pair : similarSolutionPairs) {
+            Solution sol1 = pair.getFirstSolution();
+            Solution sol2 = pair.getSecondSolution();
+            SimilarSolutionCluster cluster;
             // If both solutions are not in an existing cluster
             if (similarSolutionClusters.stream().noneMatch(c -> c.containsSolution(sol1)) &&
-                    similarSolutionClusters.stream().noneMatch(c -> c.containsSolution(sol2))) {
+                similarSolutionClusters.stream().noneMatch(c -> c.containsSolution(sol2))) {
                 cluster = new SimilarSolutionCluster(sol1.getExerciseName(), sol1, sol2);
+                cluster.addSolutionPair(pair);
+                similarSolutionClusters.add(cluster);
             }
             // If only first solution is in an existing cluster
-            else if (similarSolutionClusters.stream().anyMatch(c -> c.containsSolution(sol1)) && similarSolutionClusters.stream().noneMatch(c -> c.containsSolution(sol2))) {
+            else if (similarSolutionClusters.stream().anyMatch(c -> c.containsSolution(sol1)) &&
+                     similarSolutionClusters.stream().noneMatch(c -> c.containsSolution(sol2))) {
                 SimilarSolutionCluster existingCluster = similarSolutionClusters.stream().filter(x -> x.getSolutions().contains(sol1)).findAny().get();
                 existingCluster.addSolution(sol2);
-                existingCluster.addSolutionPair(pair1);
+                existingCluster.addSolutionPair(pair);
             }
             // If only second solution is in an existing cluster
-            else if (similarSolutionClusters.stream().noneMatch(c -> c.containsSolution(sol1)) && similarSolutionClusters.stream().anyMatch(c -> c.containsSolution(sol2))) {
+            else if (similarSolutionClusters.stream().noneMatch(c -> c.containsSolution(sol1)) &&
+                     similarSolutionClusters.stream().anyMatch(c -> c.containsSolution(sol2))) {
                 SimilarSolutionCluster existingCluster = similarSolutionClusters.stream().filter(x -> x.getSolutions().contains(sol2)).findAny().get();
                 existingCluster.addSolution(sol1);
-                existingCluster.addSolutionPair(pair1);
+                existingCluster.addSolutionPair(pair);
             }
             // If both are in an existing cluster
-            else if (similarSolutionClusters.stream().anyMatch(c -> c.containsSolution(sol1)) && similarSolutionClusters.stream().anyMatch(c -> c.containsSolution(sol2))) {
+            else if (similarSolutionClusters.stream().anyMatch(c -> c.containsSolution(sol1)) &&
+                     similarSolutionClusters.stream().anyMatch(c -> c.containsSolution(sol2))) {
                 SimilarSolutionCluster existingCluster1 = similarSolutionClusters.stream().filter(x -> x.getSolutions().contains(sol1)).findAny().get();
                 SimilarSolutionCluster existingCluster2 = similarSolutionClusters.stream().filter(x -> x.getSolutions().contains(sol2)).findAny().get();
+                // If both are in the same cluster
                 if (existingCluster1 == existingCluster2) {
-                    existingCluster1.addSolutionPair(pair1);
-                } else {
-                    existingCluster1.addSolution(sol2);
-                    existingCluster1.addSolutionPair(pair1);
-                    existingCluster2.addSolution(sol1);
-                    existingCluster2.addSolutionPair(pair1);
+                    existingCluster1.addSolutionPair(pair);
                 }
-            }
-            if (cluster != null) {
-                cluster.addSolutionPair(pair1);
-                for (SimilarSolutionPair pair2 : similarSolutionPairs) {
-                    if (pair1 == pair2) continue;
-                    if (cluster.containsSolution(pair2.getFirstSolution())) {
-                        cluster.addSolution(pair2.getSecondSolution());
-                        cluster.addSolutionPair(pair2);
-                    } else if (cluster.containsSolution(pair2.getSecondSolution())) {
-                        cluster.addSolution(pair2.getFirstSolution());
-                        cluster.addSolutionPair(pair2);
-                    }
+                // If in a different cluster, then we join the two clusters
+                else {
+                    existingCluster1.addSolutionPair(pair);
+                    existingCluster2.getSolutions().forEach(existingCluster1::addSolution);
+                    existingCluster2.getSolutionPairs().forEach(existingCluster1::addSolutionPair);
+                    similarSolutionClusters.remove(existingCluster2);
                 }
-                similarSolutionClusters.add(cluster);
             }
         }
+        // Create cluster names
         for (SimilarSolutionCluster cluster : similarSolutionClusters) {
             cluster.createName();
         }
diff --git a/src/main/java/ee/ut/similaritydetector/backend/Exercise.java b/src/main/java/ee/ut/similaritydetector/backend/Exercise.java
@@ -82,7 +82,7 @@ public void calculateSimilarityThreshold() {
         if (averageSolutionLength > 100)
             lengthMultiplier = averageSolutionLength / 100;
 
-        similarityThreshold = Math.pow(0.985, lengthMultiplier);
+        similarityThreshold = Math.pow(0.98, lengthMultiplier); //Math.exp(-0.0175 * lengthMultiplier);
         System.out.println(name + " - Used similarity threshold: " +similarityThreshold);
     }
 
diff --git a/src/main/java/ee/ut/similaritydetector/backend/LevenshteinDistance.java b/src/main/java/ee/ut/similaritydetector/backend/LevenshteinDistance.java
@@ -1,12 +1,11 @@
 package ee.ut.similaritydetector.backend;
 
-import java.util.Arrays;
-
 public class LevenshteinDistance {
 
     /**
      * <p>Adapted from: https://github.com/tdebatty/java-string-similarity/blob/master/src/main/java/info/debatty/java/stringsimilarity/Levenshtein.java [06.03.2021]</p>
      * <p>Calculates the customised Levenshtein distance metric of two strings.</p>
+     * <p>Optimisation: the algorithm halts as soon as the distance metric surpasses the given threshold.</p>
      * <p>Customisations made:
      * <ol>
      *     <li>Same characters with case difference have an edit cost of 0.2 (instead of 1).</li>
@@ -22,29 +21,58 @@ public class LevenshteinDistance {
      *         <li>ž <-> z</li>
      *     </ul>
      *     </li>
-     * </ol></p>
-     * <p>There are two optimizations made:
-     * <ol>
-     *     <li>The strings are presented as char arrays for improved efficiency.</li>
-     *     <li>If the distance metric surpasses the given threshold, the algorithm is halted.</li>
-     * </ol></p>
-     *
-     * @param s1        char array of string 1
-     * @param s2        char array of string 2
+     * </ol>
+     * </p>
+     * @param s1        String 1
+     * @param s2        String 2
      * @param threshold the max difference threshold
      * @return the Levenshtein distance between s1 and s2
      */
-    public static double distance(char[] s1, char[] s2, int threshold) {
-        if (s1 == null || s2 == null) {
+    public static double distance(String s1, String s2, int threshold) {
+        if (s1 == null || s2 == null || s1.length() == 0 || s2.length() == 0) {
             throw new NullPointerException("Strings cannot be null");
         }
 
-        if (Arrays.equals(s1, s2)) {
+        if (s1.equals(s2)) {
+            return 0;
+        }
+
+        // Find the common prefix and suffix of the two strings
+        // and remove them from the strings in order to make the algorithm faster
+        StringBuilder sb1 = new StringBuilder(s1);
+        StringBuilder sb2 = new StringBuilder(s2);
+        int prefixMismatchIndex;
+        if (sb1.length() <= sb2.length()) {
+            prefixMismatchIndex = findFirstMismatchIndex(sb1, sb2);
+        } else {
+            prefixMismatchIndex = findFirstMismatchIndex(sb2, sb1);
+        }
+
+        // If strings are equal, return distance 0
+        if (prefixMismatchIndex == -1) {
             return 0;
         }
+        sb1 = new StringBuilder(sb1.substring(prefixMismatchIndex));
+        sb2 = new StringBuilder(sb2.substring(prefixMismatchIndex));
+
+        // For finding suffix mismatch the strings are reversed
+        sb1.reverse();
+        sb2.reverse();
+
+        int suffixMismatchIndex;
+        if (sb1.length() <= sb2.length()) {
+            suffixMismatchIndex = findFirstMismatchIndex(sb1, sb2);
+        } else {
+            suffixMismatchIndex = findFirstMismatchIndex(sb2, sb1);
+        }
+
+        sb1 = new StringBuilder(sb1.substring(suffixMismatchIndex));
+        sb2 = new StringBuilder(sb2.substring(suffixMismatchIndex));
+        s1 = sb1.reverse().toString();
+        s2 = sb2.reverse().toString();
 
-        int n = s1.length;
-        int m = s2.length;
+        int n = s1.length();
+        int m = s2.length();
 
         double[] prevCosts = new double[m + 1];
         double[] currCosts = new double[m + 1];
@@ -67,12 +95,13 @@ public static double distance(char[] s1, char[] s2, int threshold) {
 
             // Find the costs for the current row to get the minimum edit cost
             for (int j = 0; j < m; j++) {
-                currCosts[j + 1] = findCost(s1, s2, prevCosts, currCosts, i, j);
+                currCosts[j + 1] = findCost(s1.charAt(i), s2.charAt(j), prevCosts, currCosts, j);
                 minCost = Math.min(minCost, currCosts[j + 1]);
             }
 
-            // If the distance threshold is passed
-            if (minCost >= threshold) {
+            // If the distance threshold is surpassed,
+            // the two strings are considered not similar enough
+            if (minCost > threshold) {
                 return -1;
             }
 
@@ -84,40 +113,57 @@ public static double distance(char[] s1, char[] s2, int threshold) {
         return prevCosts[m];
     }
 
+    private static int findFirstMismatchIndex(StringBuilder s1, StringBuilder s2) {
+        int mismatchIndex = 0;
+        for (int i = 0, n = s1.length(); i < n; i++) {
+            if (s1.charAt(i) != s2.charAt(i)) {
+                return mismatchIndex;
+            }
+            mismatchIndex ++;
+        }
+        if (s1.length() > mismatchIndex || s2.length() > mismatchIndex) {
+            return mismatchIndex;
+        }
+        // Strings are equal
+        else {
+            return -1;
+        }
+    }
+
+
     /**
      * Finds the minimal edit cost for the current characters
      *
-     * @param s1 char array of string 1
-     * @param s2 char array of string 2
+     * @param c1 character from String 1 that is compared
+     * @param c2 character from String 2 that is compared
      * @param prevCosts previous row of costs
      * @param currCosts current row of costs
-     * @param i current character index of string 1
-     * @param j current character index of string 2
+     * @param j the index of character c2 in String 2
      * @return the cost for the edit
      */
-    public static double findCost(char[] s1, char[] s2, double[] prevCosts, double[] currCosts, int i, int j) {
+    public static double findCost(char c1, char c2, double[] prevCosts, double[] currCosts, int j) {
         // Initial substitution cost is 1
         double subCost = 1;
         // Same characters
-        if (s1[i] == s2[j]) {
+        if (c1 == c2) {
             subCost = 0;
         }
         // Char case difference
-        else if (Character.toLowerCase(s1[i]) == s2[j] && Character.toLowerCase(s2[j]) != s2[j] ||
-                Character.toLowerCase(s2[j]) == s1[i] && Character.toLowerCase(s1[i]) != s1[i]) {
+        else if (Character.toLowerCase(c1) == c2 && Character.toLowerCase(c1) != c1 ||
+                Character.toLowerCase(c2) == c1 && Character.toLowerCase(c2) != c2) {
             subCost = 0.2;
         }
         // Char substitution cost special cases
-        else if (charSubstitutionCondition(s1[i], s2[j], '\"', '\'') ||
-                charSubstitutionCondition(s1[i], s2[j], 'o', 'õ') ||
-                charSubstitutionCondition(s1[i], s2[j], 'o', 'ö') ||
-                charSubstitutionCondition(s1[i], s2[j], 'a', 'ä') ||
-                charSubstitutionCondition(s1[i], s2[j], 'u', 'ü') ||
-                charSubstitutionCondition(s1[i], s2[j], 'y', 'ü') ||
-                charSubstitutionCondition(s1[i], s2[j], '2', 'ä') ||
-                charSubstitutionCondition(s1[i], s2[j], '6', 'õ') ||
-                charSubstitutionCondition(s1[i], s2[j], 's', 'š') ||
-                charSubstitutionCondition(s1[i], s2[j], 'z', 'ž')) {
+        else if (charSubstitutionCondition(c1, c2, '\"', '\'') ||
+                charSubstitutionCondition(c1, c2, 'o', 'õ') ||
+                charSubstitutionCondition(c1, c2, 'o', 'ö') ||
+                charSubstitutionCondition(c1, c2, 'a', 'ä') ||
+                charSubstitutionCondition(c1, c2, 'u', 'ü') ||
+                charSubstitutionCondition(c1, c2, 'y', 'ü') ||
+                charSubstitutionCondition(c1, c2, '2', 'ä') ||
+                charSubstitutionCondition(c1, c2, '6', 'õ') ||
+                charSubstitutionCondition(c1, c2, 's', 'š') ||
+                charSubstitutionCondition(c1, c2, 'z', 'ž')) {
             subCost = 0.2;
         }
         return Math.min(
@@ -143,8 +189,15 @@ private static boolean charSubstitutionCondition(char curr1, char curr2, char su
     public static double normalisedLevenshteinSimilarity(String s1, String s2, float similarityThreshold) {
         int maxLength = Math.max(s1.length(), s2.length());
         int distanceThreshold = Math.round((1 - similarityThreshold) * maxLength);
-        double levenshteinDistance = distance(s1.toCharArray(), s2.toCharArray(), distanceThreshold);
+        double levenshteinDistance;
+        try {
+            levenshteinDistance = distance(s1, s2, distanceThreshold);
+        } catch (NullPointerException e) {
+            // If a string was null or empty then return similarity 0
+            return 0;
+        }
         if (levenshteinDistance == -1) {
+            // If the difference threshold was surpassed, then return similarity 0
             return 0;
         }
         return 1.0 - levenshteinDistance / maxLength;
diff --git a/src/main/java/ee/ut/similaritydetector/backend/Solution.java b/src/main/java/ee/ut/similaritydetector/backend/Solution.java
@@ -147,4 +147,8 @@ public void generateSyntaxHighlightedHTML() {
         sourceCodeHTMLLight = new File(lightHTMLPath);
     }
 
+    @Override
+    public String toString() {
+        return author + " - " + exerciseName;
+    }
 }
diff --git a/src/main/resources/ee/ut/similaritydetector/python/Preprocessor.py b/src/main/resources/ee/ut/similaritydetector/python/Preprocessor.py
@@ -9,7 +9,6 @@
 """ Function taken from: https://stackoverflow.com/questions/1769332/script-to-remove-python-comments-docstrings [04.03.2021]
     Original author: Dan McDougall (https://stackoverflow.com/users/357007/dan-mcdougall) 
     Modifications made by: Basj (https://stackoverflow.com/users/1422096/basj)
-    I also made some customisations.
 """
 def preprocess_source_code(source_code):
     source_code_io = io.BytesIO(source_code)
@@ -44,6 +43,7 @@ def preprocess_source_code(source_code):
     return output
 
 
+# Starting the preprocessing
 preprocessed_code_filepath = source_code_filepath[0: len(source_code_filepath) - 3] + "_preprocessed.py"
 
 with open(source_code_filepath, 'rb') as source_code_file:

Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,7 @@ dependencies {`
`29`	`29`	`implementation 'org.python:jython-standalone:2.7.2'`
`30`	`30`	`/* Pygments version 2.5.2 is the last version that supports Python 2,`
`31`	`31`	`* so it's the last version compatible with Jython,`
`32`		`- * because the latter doesn't support Python 3 yet */`
	`32`	`+ * because the latter doesn't support Python 3 (yet?) */`
`33`	`33`	`implementation 'org.pygments:pygments:2.5.2'`
`34`	`34`	`}`
`35`	`35`
Original file line number	Diff line number	Diff line change
`@@ -82,7 +82,7 @@ public void calculateSimilarityThreshold() {`
`82`	`82`	`if (averageSolutionLength > 100)`
`83`	`83`	`lengthMultiplier = averageSolutionLength / 100;`
`84`	`84`
`85`		`- similarityThreshold = Math.pow(0.985, lengthMultiplier);`
	`85`	`+ similarityThreshold = Math.pow(0.98, lengthMultiplier); //Math.exp(-0.0175 * lengthMultiplier);`
`86`	`86`	`System.out.println(name + " - Used similarity threshold: " +similarityThreshold);`
`87`	`87`	`}`
`88`	`88`
Original file line number	Diff line number	Diff line change
`@@ -147,4 +147,8 @@ public void generateSyntaxHighlightedHTML() {`
`147`	`147`	`sourceCodeHTMLLight = new File(lightHTMLPath);`
`148`	`148`	`}`
`149`	`149`
	`150`	`+ @Override`
	`151`	`+ public String toString() {`
	`152`	`+ return author + " - " + exerciseName;`
	`153`	`+ }`
`150`	`154`	`}`