11package ee .ut .similaritydetector .backend ;
22
3- import java .util .Arrays ;
4-
53public class LevenshteinDistance {
64
75 /**
86 * <p>Adapted from: https://github.com/tdebatty/java-string-similarity/blob/master/src/main/java/info/debatty/java/stringsimilarity/Levenshtein.java [06.03.2021]</p>
97 * <p>Calculates the customised Levenshtein distance metric of two strings.</p>
8+ * <p>Optimisation: the algorithm halts as soon as the distance metric surpasses the given threshold.</p>
109 * <p>Customisations made:
1110 * <ol>
1211 * <li>Same characters with case difference have an edit cost of 0.2 (instead of 1).</li>
@@ -22,29 +21,58 @@ public class LevenshteinDistance {
2221 * <li>ž <-> z</li>
2322 * </ul>
2423 * </li>
25- * </ol></p>
26- * <p>There are two optimizations made:
27- * <ol>
28- * <li>The strings are presented as char arrays for improved efficiency.</li>
29- * <li>If the distance metric surpasses the given threshold, the algorithm is halted.</li>
30- * </ol></p>
31- *
32- * @param s1 char array of string 1
33- * @param s2 char array of string 2
24+ * </ol>
25+ * </p>
26+ * @param s1 String 1
27+ * @param s2 String 2
3428 * @param threshold the max difference threshold
3529 * @return the Levenshtein distance between s1 and s2
3630 */
37- public static double distance (char [] s1 , char [] s2 , int threshold ) {
38- if (s1 == null || s2 == null ) {
31+ public static double distance (String s1 , String s2 , int threshold ) {
32+ if (s1 == null || s2 == null || s1 . length () == 0 || s2 . length () == 0 ) {
3933 throw new NullPointerException ("Strings cannot be null" );
4034 }
4135
42- if (Arrays .equals (s1 , s2 )) {
36+ if (s1 .equals (s2 )) {
37+ return 0 ;
38+ }
39+
40+ // Find the common prefix and suffix of the two strings
41+ // and remove them from the strings in order to make the algorithm faster
42+ StringBuilder sb1 = new StringBuilder (s1 );
43+ StringBuilder sb2 = new StringBuilder (s2 );
44+ int prefixMismatchIndex ;
45+ if (sb1 .length () <= sb2 .length ()) {
46+ prefixMismatchIndex = findFirstMismatchIndex (sb1 , sb2 );
47+ } else {
48+ prefixMismatchIndex = findFirstMismatchIndex (sb2 , sb1 );
49+ }
50+
51+ // If strings are equal, return distance 0
52+ if (prefixMismatchIndex == -1 ) {
4353 return 0 ;
4454 }
55+ sb1 = new StringBuilder (sb1 .substring (prefixMismatchIndex ));
56+ sb2 = new StringBuilder (sb2 .substring (prefixMismatchIndex ));
57+
58+ // For finding suffix mismatch the strings are reversed
59+ sb1 .reverse ();
60+ sb2 .reverse ();
61+
62+ int suffixMismatchIndex ;
63+ if (sb1 .length () <= sb2 .length ()) {
64+ suffixMismatchIndex = findFirstMismatchIndex (sb1 , sb2 );
65+ } else {
66+ suffixMismatchIndex = findFirstMismatchIndex (sb2 , sb1 );
67+ }
68+
69+ sb1 = new StringBuilder (sb1 .substring (suffixMismatchIndex ));
70+ sb2 = new StringBuilder (sb2 .substring (suffixMismatchIndex ));
71+ s1 = sb1 .reverse ().toString ();
72+ s2 = sb2 .reverse ().toString ();
4573
46- int n = s1 .length ;
47- int m = s2 .length ;
74+ int n = s1 .length () ;
75+ int m = s2 .length () ;
4876
4977 double [] prevCosts = new double [m + 1 ];
5078 double [] currCosts = new double [m + 1 ];
@@ -67,12 +95,13 @@ public static double distance(char[] s1, char[] s2, int threshold) {
6795
6896 // Find the costs for the current row to get the minimum edit cost
6997 for (int j = 0 ; j < m ; j ++) {
70- currCosts [j + 1 ] = findCost (s1 , s2 , prevCosts , currCosts , i , j );
98+ currCosts [j + 1 ] = findCost (s1 . charAt ( i ) , s2 . charAt ( j ) , prevCosts , currCosts , j );
7199 minCost = Math .min (minCost , currCosts [j + 1 ]);
72100 }
73101
74- // If the distance threshold is passed
75- if (minCost >= threshold ) {
102+ // If the distance threshold is surpassed,
103+ // then considers the two strings not similar enough
104+ if (minCost > threshold ) {
76105 return -1 ;
77106 }
78107
@@ -84,40 +113,57 @@ public static double distance(char[] s1, char[] s2, int threshold) {
84113 return prevCosts [m ];
85114 }
86115
116+ private static int findFirstMismatchIndex (StringBuilder s1 , StringBuilder s2 ) {
117+ int mismatchIndex = 0 ;
118+ for (int i = 0 , n = s1 .length (); i < n ; i ++) {
119+ if (s1 .charAt (i ) != s2 .charAt (i )) {
120+ return mismatchIndex ;
121+ }
122+ mismatchIndex ++;
123+ }
124+ if (s1 .length () > mismatchIndex || s2 .length () > mismatchIndex ) {
125+ return mismatchIndex ;
126+ }
127+ // Strings are equal
128+ else {
129+ return -1 ;
130+ }
131+ }
132+
133+
87134 /**
88135 * Finds the minimal edit cost for the current characters
89136 *
90- * @param s1 char array of string 1
91- * @param s2 char array of string 2
137+ * @param c1 character from String 1 that is compared
138+ * @param c2 character from String 2 that is compared
92139 * @param prevCosts previous row of costs
93140 * @param currCosts current row of costs
94- * @param i current character index of string 1
95- * @param j current character index of string 2
141+ * @param j the index of character c2 in String 2
96142 * @return the cost for the edit
97143 */
98- public static double findCost (char [] s1 , char [] s2 , double [] prevCosts , double [] currCosts , int i , int j ) {
144+ public static double findCost (char c1 , char c2 , double [] prevCosts , double [] currCosts , int j ) {
99145 // Initial substitution cost is 1
100146 double subCost = 1 ;
101147 // Same characters
102- if (s1 [ i ] == s2 [ j ] ) {
148+ if (c1 == c2 ) {
103149 subCost = 0 ;
104150 }
105151 // Char case difference
106- else if (Character .toLowerCase (s1 [ i ] ) == s2 [ j ] && Character .toLowerCase (s2 [ j ] ) != s2 [ j ] ||
107- Character .toLowerCase (s2 [ j ] ) == s1 [ i ] && Character .toLowerCase (s1 [ i ] ) != s1 [ i ] ) {
152+ else if (Character .toLowerCase (c1 ) == c2 && Character .toLowerCase (c1 ) != c1 ||
153+ Character .toLowerCase (c2 ) == c1 && Character .toLowerCase (c2 ) != c2 ) {
108154 subCost = 0.2 ;
109155 }
110156 // Char substitution cost special cases
111- else if (charSubstitutionCondition (s1 [ i ], s2 [ j ] , '\"' , '\'' ) ||
112- charSubstitutionCondition (s1 [ i ], s2 [ j ] , 'o' , 'õ' ) ||
113- charSubstitutionCondition (s1 [ i ], s2 [ j ] , 'o' , 'ö' ) ||
114- charSubstitutionCondition (s1 [ i ], s2 [ j ] , 'a' , 'ä' ) ||
115- charSubstitutionCondition (s1 [ i ], s2 [ j ] , 'u' , 'ü' ) ||
116- charSubstitutionCondition (s1 [ i ], s2 [ j ] , 'y' , 'ü' ) ||
117- charSubstitutionCondition (s1 [ i ], s2 [ j ] , '2' , 'ä' ) ||
118- charSubstitutionCondition (s1 [ i ], s2 [ j ] , '6' , 'õ' ) ||
119- charSubstitutionCondition (s1 [ i ], s2 [ j ] , 's' , 'š' ) ||
120- charSubstitutionCondition (s1 [ i ], s2 [ j ] , 'z' , 'ž' )) {
157+ else if (charSubstitutionCondition (c1 , c2 , '\"' , '\'' ) ||
158+ charSubstitutionCondition (c1 , c2 , 'o' , 'õ' ) ||
159+ charSubstitutionCondition (c1 , c2 , 'o' , 'ö' ) ||
160+ charSubstitutionCondition (c1 , c2 , 'a' , 'ä' ) ||
161+ charSubstitutionCondition (c1 , c2 , 'u' , 'ü' ) ||
162+ charSubstitutionCondition (c1 , c2 , 'y' , 'ü' ) ||
163+ charSubstitutionCondition (c1 , c2 , '2' , 'ä' ) ||
164+ charSubstitutionCondition (c1 , c2 , '6' , 'õ' ) ||
165+ charSubstitutionCondition (c1 , c2 , 's' , 'š' ) ||
166+ charSubstitutionCondition (c1 , c2 , 'z' , 'ž' )) {
121167 subCost = 0.2 ;
122168 }
123169 return Math .min (
@@ -143,8 +189,15 @@ private static boolean charSubstitutionCondition(char curr1, char curr2, char su
143189 public static double normalisedLevenshteinSimilarity (String s1 , String s2 , float similarityThreshold ) {
144190 int maxLength = Math .max (s1 .length (), s2 .length ());
145191 int distanceThreshold = Math .round ((1 - similarityThreshold ) * maxLength );
146- double levenshteinDistance = distance (s1 .toCharArray (), s2 .toCharArray (), distanceThreshold );
192+ double levenshteinDistance ;
193+ try {
194+ levenshteinDistance = distance (s1 , s2 , distanceThreshold );
195+ } catch (NullPointerException e ) {
196+ // If a string was null or empty then return similarity 0
197+ return 0 ;
198+ }
147199 if (levenshteinDistance == -1 ) {
200+ // If the difference threshold was surpassed, then return similarity 0
148201 return 0 ;
149202 }
150203 return 1.0 - levenshteinDistance / maxLength ;
0 commit comments