1818 */
1919package org .neo4j .graphalgo .similarity ;
2020
21- import org .neo4j .procedure .Description ;
22- import org .neo4j .procedure .Name ;
23- import org .neo4j .procedure .UserFunction ;
24-
21+ import com .carrotsearch .hppc .LongDoubleHashMap ;
22+ import com .carrotsearch .hppc .LongDoubleMap ;
23+ import com .carrotsearch .hppc .LongHashSet ;
24+ import com .carrotsearch .hppc .LongSet ;
25+ import com .carrotsearch .hppc .cursors .LongCursor ;
26+ import org .neo4j .graphalgo .core .ProcedureConfiguration ;
27+ import org .neo4j .graphalgo .core .utils .Intersections ;
28+ import org .neo4j .procedure .*;
29+
30+ import java .util .HashMap ;
2531import java .util .HashSet ;
2632import java .util .List ;
33+ import java .util .Map ;
34+
35+ import static org .neo4j .graphalgo .similarity .SimilarityVectorAggregator .CATEGORY_KEY ;
36+ import static org .neo4j .graphalgo .similarity .SimilarityVectorAggregator .WEIGHT_KEY ;
2737
2838public class Similarities {
2939
@@ -49,51 +59,83 @@ public double cosineSimilarity(@Name("vector1") List<Number> vector1, @Name("vec
4959 throw new RuntimeException ("Vectors must be non-empty and of the same size" );
5060 }
5161
52- double dotProduct = 0d ;
53- double xLength = 0d ;
54- double yLength = 0d ;
55- for (int i = 0 ; i < vector1 .size (); i ++) {
56- double weight1 = vector1 .get (i ).doubleValue ();
57- double weight2 = vector2 .get (i ).doubleValue ();
62+ int len = Math .min (vector1 .size (), vector2 .size ());
63+ double [] weights1 = new double [len ];
64+ double [] weights2 = new double [len ];
5865
59- dotProduct += weight1 * weight2 ;
60- xLength += weight1 * weight1 ;
61- yLength += weight2 * weight2 ;
66+ for ( int i = 0 ; i < len ; i ++) {
67+ weights1 [ i ] = vector1 . get ( i ). doubleValue () ;
68+ weights2 [ i ] = vector2 . get ( i ). doubleValue () ;
6269 }
6370
64- xLength = Math .sqrt (xLength );
65- yLength = Math . sqrt ( yLength );
71+ return Math .sqrt (Intersections . cosineSquare ( weights1 , weights2 , len ) );
72+ }
6673
67- return dotProduct / (xLength * yLength );
74+ @ UserAggregationFunction ("algo.similarity.asVector" )
75+ @ Description ("algo.similarity.asVector - builds a vector of maps containing items and weights" )
76+ public SimilarityVectorAggregator asVector () {
77+ return new SimilarityVectorAggregator ();
6878 }
6979
7080 @ UserFunction ("algo.similarity.pearson" )
7181 @ Description ("algo.similarity.pearson([vector1], [vector2]) " +
7282 "given two collection vectors, calculate pearson similarity" )
73- public double pearsonSimilarity (@ Name ("vector1" ) List <Number > vector1 , @ Name ("vector2" ) List <Number > vector2 ) {
74- if (vector1 .size () != vector2 .size () || vector1 .size () == 0 ) {
75- throw new RuntimeException ("Vectors must be non-empty and of the same size" );
83+ public double pearsonSimilarity (@ Name ("vector1" ) Object rawVector1 , @ Name ("vector2" ) Object rawVector2 , @ Name (value = "config" , defaultValue = "{}" ) Map <String , Object > config ) {
84+ ProcedureConfiguration configuration = ProcedureConfiguration .create (config );
85+
86+ String listType = configuration .get ("vectorType" , "numbers" );
87+
88+ if (listType .equalsIgnoreCase ("maps" )) {
89+ List <Map <String , Object >> vector1 = (List <Map <String , Object >>) rawVector1 ;
90+ List <Map <String , Object >> vector2 = (List <Map <String , Object >>) rawVector2 ;
91+
92+ LongSet ids = new LongHashSet ();
93+
94+ LongDoubleMap v1Mappings = new LongDoubleHashMap ();
95+ for (Map <String , Object > entry : vector1 ) {
96+ Long id = (Long ) entry .get (CATEGORY_KEY );
97+ ids .add (id );
98+ v1Mappings .put (id , (Double ) entry .get (WEIGHT_KEY ));
99+ }
100+
101+ LongDoubleMap v2Mappings = new LongDoubleHashMap ();
102+ for (Map <String , Object > entry : vector2 ) {
103+ Long id = (Long ) entry .get (CATEGORY_KEY );
104+ ids .add (id );
105+ v2Mappings .put (id , (Double ) entry .get (WEIGHT_KEY ));
106+ }
107+
108+ double [] weights1 = new double [ids .size ()];
109+ double [] weights2 = new double [ids .size ()];
110+
111+ double skipValue = Double .NaN ;
112+ int index = 0 ;
113+ for (long id : ids .toArray ()) {
114+ weights1 [index ] = v1Mappings .getOrDefault (id , skipValue );
115+ weights2 [index ] = v2Mappings .getOrDefault (id , skipValue );
116+ index ++;
117+ }
118+
119+ return Intersections .pearsonSkip (weights1 , weights2 , ids .size (), skipValue );
120+ } else {
121+ List <Number > vector1 = (List <Number >) rawVector1 ;
122+ List <Number > vector2 = (List <Number >) rawVector2 ;
123+
124+ if (vector1 .size () != vector2 .size () || vector1 .size () == 0 ) {
125+ throw new RuntimeException ("Vectors must be non-empty and of the same size" );
126+ }
127+
128+ int len = vector1 .size ();
129+ double [] weights1 = new double [len ];
130+ double [] weights2 = new double [len ];
131+
132+ for (int i = 0 ; i < len ; i ++) {
133+ weights1 [i ] = vector1 .get (i ).doubleValue ();
134+ weights2 [i ] = vector2 .get (i ).doubleValue ();
135+ }
136+ return Intersections .pearson (weights1 , weights2 , len );
76137 }
77138
78- double vector1Mean = vector1 .stream ().mapToDouble (Number ::doubleValue ).average ().orElse (1 );
79- double vector2Mean = vector2 .stream ().mapToDouble (Number ::doubleValue ).average ().orElse (1 );
80-
81- double dotProductMinusMean = 0d ;
82- double xLength = 0d ;
83- double yLength = 0d ;
84- for (int i = 0 ; i < vector1 .size (); i ++) {
85- double weight1 = vector1 .get (i ).doubleValue ();
86- double weight2 = vector2 .get (i ).doubleValue ();
87-
88- double vector1Delta = weight1 - vector1Mean ;
89- double vector2Delta = weight2 - vector2Mean ;
90-
91- dotProductMinusMean += (vector1Delta * vector2Delta );
92- xLength += vector1Delta * vector1Delta ;
93- yLength += vector2Delta * vector2Delta ;
94- }
95-
96- return dotProductMinusMean / (Math .sqrt (xLength * yLength ));
97139 }
98140
99141 @ UserFunction ("algo.similarity.euclideanDistance" )
@@ -104,15 +146,16 @@ public double euclideanDistance(@Name("vector1") List<Number> vector1, @Name("ve
104146 throw new RuntimeException ("Vectors must be non-empty and of the same size" );
105147 }
106148
107- double distance = 0.0 ;
108- for (int i = 0 ; i < vector1 .size (); i ++) {
109- double sqOfDiff = vector1 .get (i ).doubleValue () - vector2 .get (i ).doubleValue ();
110- sqOfDiff *= sqOfDiff ;
111- distance += sqOfDiff ;
149+ int len = Math .min (vector1 .size (), vector2 .size ());
150+ double [] weights1 = new double [len ];
151+ double [] weights2 = new double [len ];
152+
153+ for (int i = 0 ; i < len ; i ++) {
154+ weights1 [i ] = vector1 .get (i ).doubleValue ();
155+ weights2 [i ] = vector2 .get (i ).doubleValue ();
112156 }
113- distance = Math .sqrt (distance );
114157
115- return distance ;
158+ return Math . sqrt ( Intersections . sumSquareDelta ( weights1 , weights2 , len )) ;
116159 }
117160
118161 @ UserFunction ("algo.similarity.euclidean" )
0 commit comments