Similarity refactoring (#803)

mneedham · web-flow · commit 6a589a5e0c24 · 2019-02-01T11:54:20.000Z
* Delegate pearson function to the array based computation

* Delegate cosine function to the array based computation

* Delegate euclidean function to the array based computation

* Add a similarity vector aggregation function so users do not have to write the boring collect step themselves

* Add more tests and push the NaN logic into Intersections

* nicer switch config name

* Use a better name for the aggregation function

* category instead of id
diff --git a/algo/pom.xml b/algo/pom.xml
@@ -101,6 +101,14 @@
             <version>${neo4j.version}</version>
             <scope>provided</scope>
         </dependency>
+
+        <dependency>
+            <groupId>org.mockito</groupId>
+            <artifactId>mockito-core</artifactId>
+            <version>2.23.4</version>
+            <scope>test</scope>
+        </dependency>
+
     </dependencies>
 
     <build>
diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/Similarities.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/Similarities.java
@@ -18,12 +18,22 @@
  */
 package org.neo4j.graphalgo.similarity;
 
-import org.neo4j.procedure.Description;
-import org.neo4j.procedure.Name;
-import org.neo4j.procedure.UserFunction;
-
+import com.carrotsearch.hppc.LongDoubleHashMap;
+import com.carrotsearch.hppc.LongDoubleMap;
+import com.carrotsearch.hppc.LongHashSet;
+import com.carrotsearch.hppc.LongSet;
+import com.carrotsearch.hppc.cursors.LongCursor;
+import org.neo4j.graphalgo.core.ProcedureConfiguration;
+import org.neo4j.graphalgo.core.utils.Intersections;
+import org.neo4j.procedure.*;
+
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
+
+import static org.neo4j.graphalgo.similarity.SimilarityVectorAggregator.CATEGORY_KEY;
+import static org.neo4j.graphalgo.similarity.SimilarityVectorAggregator.WEIGHT_KEY;
 
 public class Similarities {
 
@@ -49,51 +59,108 @@ public double cosineSimilarity(@Name("vector1") List<Number> vector1, @Name("vec
             throw new RuntimeException("Vectors must be non-empty and of the same size");
         }
 
-        double dotProduct = 0d;
-        double xLength = 0d;
-        double yLength = 0d;
-        for (int i = 0; i < vector1.size(); i++) {
-            double weight1 = vector1.get(i).doubleValue();
-            double weight2 = vector2.get(i).doubleValue();
+        int len = Math.min(vector1.size(), vector2.size());
+        double[] weights1 = new double[len];
+        double[] weights2 = new double[len];
 
-            dotProduct += weight1 * weight2;
-            xLength += weight1 * weight1;
-            yLength += weight2 * weight2;
+        for (int i = 0; i < len; i++) {
+            weights1[i] = vector1.get(i).doubleValue();
+            weights2[i] = vector2.get(i).doubleValue();
         }
 
-        xLength = Math.sqrt(xLength);
-        yLength = Math.sqrt(yLength);
+        return Math.sqrt(Intersections.cosineSquare(weights1, weights2, len));
+    }
 
-        return dotProduct / (xLength * yLength);
+    /**
+     * Creates the aggregator that backs {@code algo.similarity.asVector}.
+     *
+     * @return a new {@link SimilarityVectorAggregator}
+     */
+    @UserAggregationFunction("algo.similarity.asVector")
+    @Description("algo.similarity.asVector - builds a vector of maps containing items and weights")
+    public SimilarityVectorAggregator asVector() {
+        return new SimilarityVectorAggregator();
     }
 
+    /**
+     * Computes the Pearson similarity of two vectors.
+     *
+     * <p>The {@code vectorType} config entry selects how the inputs are read.
+     * {@code "maps"} (case-insensitive) expects lists of maps keyed by
+     * {@code CATEGORY_KEY} and {@code WEIGHT_KEY}: both inputs are aligned on
+     * the union of their categories, a category missing from one side is
+     * filled with NaN, and NaN is handed to {@code Intersections.pearsonSkip}
+     * as the skip value. Any other value (default {@code "numbers"}) expects
+     * two non-empty lists of numbers of equal size.
+     *
+     * @param rawVector1 first vector, in the shape selected by {@code vectorType}
+     * @param rawVector2 second vector, in the same shape
+     * @param config     optional settings; only {@code vectorType} is read here
+     * @return the value computed by {@code Intersections.pearsonSkip} or
+     *         {@code Intersections.pearson}
+     * @throws RuntimeException if the numeric vectors are empty or differ in size
+     */
     @UserFunction("algo.similarity.pearson")
     @Description("algo.similarity.pearson([vector1], [vector2]) " +
             "given two collection vectors, calculate pearson similarity")
-    public double pearsonSimilarity(@Name("vector1") List<Number> vector1, @Name("vector2") List<Number> vector2) {
-        if (vector1.size() != vector2.size() || vector1.size() == 0) {
-            throw new RuntimeException("Vectors must be non-empty and of the same size");
+    public double pearsonSimilarity(@Name("vector1") Object rawVector1, @Name("vector2") Object rawVector2, @Name(value = "config", defaultValue = "{}") Map<String, Object> config) {
+        ProcedureConfiguration configuration = ProcedureConfiguration.create(config);
+
+        String listType = configuration.get("vectorType", "numbers");
+
+        if (listType.equalsIgnoreCase("maps")) {
+            List<Map<String, Object>> vector1 = (List<Map<String, Object>>) rawVector1;
+            List<Map<String, Object>> vector2 = (List<Map<String, Object>>) rawVector2;
+
+            LongSet ids = new LongHashSet();
+
+            LongDoubleMap v1Mappings = new LongDoubleHashMap();
+            for (Map<String, Object> entry : vector1) {
+                // Go through Number rather than casting to Long/Double so that
+                // integer weights supplied by the caller do not fail the cast.
+                long id = ((Number) entry.get(CATEGORY_KEY)).longValue();
+                ids.add(id);
+                v1Mappings.put(id, ((Number) entry.get(WEIGHT_KEY)).doubleValue());
+            }
+
+            LongDoubleMap v2Mappings = new LongDoubleHashMap();
+            for (Map<String, Object> entry : vector2) {
+                long id = ((Number) entry.get(CATEGORY_KEY)).longValue();
+                ids.add(id);
+                v2Mappings.put(id, ((Number) entry.get(WEIGHT_KEY)).doubleValue());
+            }
+
+            double[] weights1 = new double[ids.size()];
+            double[] weights2 = new double[ids.size()];
+
+            double skipValue = Double.NaN;
+            int index = 0;
+            for (long id : ids.toArray()) {
+                weights1[index] = v1Mappings.getOrDefault(id, skipValue);
+                weights2[index] = v2Mappings.getOrDefault(id, skipValue);
+                index++;
+            }
+
+            return Intersections.pearsonSkip(weights1, weights2, ids.size(), skipValue);
+        } else {
+            List<Number> vector1 = (List<Number>) rawVector1;
+            List<Number> vector2 = (List<Number>) rawVector2;
+
+            if (vector1.size() != vector2.size() || vector1.size() == 0) {
+                throw new RuntimeException("Vectors must be non-empty and of the same size");
+            }
+
+            int len = vector1.size();
+            double[] weights1 = new double[len];
+            double[] weights2 = new double[len];
+
+            for (int i = 0; i < len; i++) {
+                weights1[i] = vector1.get(i).doubleValue();
+                weights2[i] = vector2.get(i).doubleValue();
+            }
+            return Intersections.pearson(weights1, weights2, len);
         }
 
-        double vector1Mean = vector1.stream().mapToDouble(Number::doubleValue).average().orElse(1);
-        double vector2Mean = vector2.stream().mapToDouble(Number::doubleValue).average().orElse(1);
-
-        double dotProductMinusMean = 0d;
-        double xLength = 0d;
-        double yLength = 0d;
-        for (int i = 0; i < vector1.size(); i++) {
-            double weight1 = vector1.get(i).doubleValue();
-            double weight2 = vector2.get(i).doubleValue();
-
-            double vector1Delta = weight1 - vector1Mean;
-            double vector2Delta = weight2 - vector2Mean;
-
-            dotProductMinusMean += (vector1Delta * vector2Delta);
-            xLength += vector1Delta * vector1Delta;
-            yLength += vector2Delta * vector2Delta;
-        }
-
-        return dotProductMinusMean / (Math.sqrt(xLength * yLength));
     }
 
     @UserFunction("algo.similarity.euclideanDistance")
@@ -104,15 +146,16 @@ public double euclideanDistance(@Name("vector1") List<Number> vector1, @Name("ve
             throw new RuntimeException("Vectors must be non-empty and of the same size");
         }
 
-        double distance = 0.0;
-        for (int i = 0; i < vector1.size(); i++) {
-            double sqOfDiff = vector1.get(i).doubleValue() - vector2.get(i).doubleValue();
-            sqOfDiff *= sqOfDiff;
-            distance += sqOfDiff;
+        int len = Math.min(vector1.size(), vector2.size());
+        double[] weights1 = new double[len];
+        double[] weights2 = new double[len];
+
+        for (int i = 0; i < len; i++) {
+            weights1[i] = vector1.get(i).doubleValue();
+            weights2[i] = vector2.get(i).doubleValue();
         }
-        distance = Math.sqrt(distance);
 
-        return distance;
+        return Math.sqrt(Intersections.sumSquareDelta(weights1, weights2, len));
     }
 
     @UserFunction("algo.similarity.euclidean")
diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityVectorAggregator.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/SimilarityVectorAggregator.java
@@ -0,0 +1,44 @@
+package org.neo4j.graphalgo.similarity;
+
+import org.neo4j.graphdb.Node;
+import org.neo4j.helpers.collection.MapUtil;
+import org.neo4j.procedure.Name;
+import org.neo4j.procedure.UserAggregationResult;
+import org.neo4j.procedure.UserAggregationUpdate;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Aggregator that collects (node, weight) pairs into a list of maps, one map
+ * per pair, keyed by {@link #CATEGORY_KEY} and {@link #WEIGHT_KEY}.
+ */
+public class SimilarityVectorAggregator {
+    /** Key under which the node id is stored in each entry. */
+    public static final String CATEGORY_KEY = "category";
+    /** Key under which the weight is stored in each entry. */
+    public static final String WEIGHT_KEY = "weight";
+
+    private final List<Map<String, Object>> vector = new ArrayList<>();
+
+    /**
+     * Appends one entry to the vector.
+     *
+     * @param node   node whose id becomes the entry's category
+     * @param weight weight stored alongside that category
+     */
+    @UserAggregationUpdate
+    public void next(
+            @Name("node") Node node, @Name("weight") double weight) {
+        vector.add(MapUtil.map(CATEGORY_KEY, node.getId(), WEIGHT_KEY, weight));
+    }
+
+    /**
+     * @return the backing list of entries, in the order they were added
+     */
+    @UserAggregationResult
+    public List<Map<String, Object>> result() {
+        return vector;
+    }
+}
diff --git a/algo/src/main/java/org/neo4j/graphalgo/similarity/WeightedInput.java b/algo/src/main/java/org/neo4j/graphalgo/similarity/WeightedInput.java
@@ -127,7 +127,6 @@ public SimilarityResult pearson(RleDecoder decoder, double similarityCutoff, Wei
 
         int len = Math.min(thisWeights.length, otherWeights.length);
         double pearson = Intersections.pearson(thisWeights, otherWeights, len);
-        pearson = Double.isNaN(pearson) ? 0 : pearson;
 
         if (similarityCutoff >= 0d && (pearson == 0 || pearson < similarityCutoff)) return null;
 
@@ -145,7 +144,6 @@ public SimilarityResult pearsonSkip(RleDecoder decoder, double similarityCutoff,
 
         int len = Math.min(thisWeights.length, otherWeights.length);
         double pearson = Intersections.pearsonSkip(thisWeights, otherWeights, len, skipValue);
-        pearson = Double.isNaN(pearson) ? 0 : pearson;
 
         if (similarityCutoff >= 0d && (pearson == 0 || pearson < similarityCutoff)) return null;
 
diff --git a/algo/src/test/java/org/neo4j/graphalgo/similarity/SimilarityVectorAggregatorTest.java b/algo/src/test/java/org/neo4j/graphalgo/similarity/SimilarityVectorAggregatorTest.java
@@ -0,0 +1,60 @@
+package org.neo4j.graphalgo.similarity;
+
+import org.junit.Test;
+import org.neo4j.graphdb.Node;
+import org.neo4j.helpers.collection.MapUtil;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import static org.hamcrest.Matchers.is;
+import static org.junit.Assert.*;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+import static org.neo4j.graphalgo.similarity.SimilarityVectorAggregator.CATEGORY_KEY;
+import static org.neo4j.graphalgo.similarity.SimilarityVectorAggregator.WEIGHT_KEY;
+
+/**
+ * Unit tests for {@link SimilarityVectorAggregator}: every call to {@code next}
+ * should contribute one category/weight map to the result, in call order.
+ */
+public class SimilarityVectorAggregatorTest {
+
+    /** A single (node, weight) pair yields a one-element vector. */
+    @Test
+    public void singleItem() {
+        Node onlyNode = mock(Node.class);
+        when(onlyNode.getId()).thenReturn(1L);
+
+        SimilarityVectorAggregator underTest = new SimilarityVectorAggregator();
+        underTest.next(onlyNode, 3.0);
+
+        Map<String, Object> entry = MapUtil.map(CATEGORY_KEY, 1L, WEIGHT_KEY, 3.0);
+        List<Map<String, Object>> expected = Collections.singletonList(entry);
+        assertThat(underTest.result(), is(expected));
+    }
+
+    /** Several pairs come back in the order they were fed in. */
+    @Test
+    public void multipleItems() {
+        // The same mock hands out ids 1, 2, 3 on successive getId() calls.
+        Node node = mock(Node.class);
+        when(node.getId()).thenReturn(1L, 2L, 3L);
+
+        SimilarityVectorAggregator underTest = new SimilarityVectorAggregator();
+        for (double weight : new double[]{3.0, 2.0, 1.0}) {
+            underTest.next(node, weight);
+        }
+
+        List<Map<String, Object>> expected = Arrays.asList(
+                MapUtil.map(CATEGORY_KEY, 1L, WEIGHT_KEY, 3.0),
+                MapUtil.map(CATEGORY_KEY, 2L, WEIGHT_KEY, 2.0),
+                MapUtil.map(CATEGORY_KEY, 3L, WEIGHT_KEY, 1.0)
+        );
+
+        assertThat(underTest.result(), is(expected));
+    }
+
+}
diff --git a/core/src/main/java/org/neo4j/graphalgo/core/utils/Intersections.java b/core/src/main/java/org/neo4j/graphalgo/core/utils/Intersections.java
@@ -200,7 +200,8 @@ public static double pearson(double[] vector1, double[] vector2, int len) {
             yLength += vector2Delta * vector2Delta;
         }
 
-        return dotProductMinusMean / Math.sqrt(xLength * yLength);
+        double result = dotProductMinusMean / Math.sqrt(xLength * yLength);
+        return Double.isNaN(result) ? 0 : result;
     }
 
     public static double pearsonSkip(double[] vector1, double[] vector2, int len, double skipValue) {
@@ -246,7 +247,8 @@ public static double pearsonSkip(double[] vector1, double[] vector2, int len, do
             yLength += vector2Delta * vector2Delta;
         }
 
-        return dotProductMinusMean / Math.sqrt(xLength * yLength);
+        double result = dotProductMinusMean / Math.sqrt(xLength * yLength);
+        return Double.isNaN(result) ? 0 : result;
     }
 
     private static boolean shouldSkip(double weight, double skipValue, boolean skipNan) {
diff --git a/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/PearsonSimilarityTest.java b/tests/src/test/java/org/neo4j/graphalgo/algo/similarity/PearsonSimilarityTest.java

Original file line number	Diff line number	Diff line change
`@@ -200,7 +200,8 @@ public static double pearson(double[] vector1, double[] vector2, int len) {`
`200`	`200`	`yLength += vector2Delta * vector2Delta;`
`201`	`201`	`}`
`202`	`202`
`203`		`- return dotProductMinusMean / Math.sqrt(xLength * yLength);`
	`203`	`+ double result = dotProductMinusMean / Math.sqrt(xLength * yLength);`
	`204`	`+ return Double.isNaN(result) ? 0 : result;`
`204`	`205`	`}`
`205`	`206`
`206`	`207`	`public static double pearsonSkip(double[] vector1, double[] vector2, int len, double skipValue) {`
`@@ -246,7 +247,8 @@ public static double pearsonSkip(double[] vector1, double[] vector2, int len, do`
`246`	`247`	`yLength += vector2Delta * vector2Delta;`
`247`	`248`	`}`
`248`	`249`
`249`		`- return dotProductMinusMean / Math.sqrt(xLength * yLength);`
	`250`	`+ double result = dotProductMinusMean / Math.sqrt(xLength * yLength);`
	`251`	`+ return Double.isNaN(result) ? 0 : result;`
`250`	`252`	`}`
`251`	`253`
`252`	`254`	`private static boolean shouldSkip(double weight, double skipValue, boolean skipNan) {`