
Commit 5758f80: debug of the joins
Parent: 4836357

File tree: 9 files changed (+109, -42 lines)


CHANGELOG.md

Lines changed: 1 addition & 1 deletion

@@ -27,7 +27,7 @@
 - Debugging DataFrame.sortBy() when used with strings.
 - DataFrame.distinct() now correctly throws NoSuchColumnError when passing an incorrect columnName.
 - Adding Error messages on Row, DataFrame, GroupedDataFrame and modules...
-- Refactoring join methods.
+- Join methods are completely revisited, providing results closer to SQL. Moreover, you can join on multiple columns.
 - Adding unit tests to cover more cases.
 - Unit tests are now run on the es5 compiled version.
 - Clarifying error messages.
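
As context for the changelog entry above, the revisited join methods take the join columns as trailing arguments, so joining on several columns is just a matter of listing them. A minimal sketch with invented sample data, assuming the package is consumed as dataframe-js (the example file below imports the local build instead):

    const { DataFrame } = require('dataframe-js');

    const orders = new DataFrame({id: [1, 2, 3], region: ['eu', 'eu', 'us'], total: [10, 20, 30]}, ['id', 'region', 'total']);
    const refunds = new DataFrame({id: [1, 3, 4], region: ['eu', 'us', 'us'], refunded: [5, 30, 7]}, ['id', 'region', 'refunded']);

    // Join on both 'id' and 'region'; columns present on only one side show up as undefined in the other side's rows.
    orders.innerJoin(refunds, 'id', 'region').show();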

examples/titanic_analysis.js

Lines changed: 31 additions & 11 deletions

@@ -3,7 +3,7 @@

 // Here we import the lib.
 // You can also: import { DataFrame } from 'dataframe-js';
-const DataFrame = require('dataframe-js').DataFrame;
+const DataFrame = require('../src/index.js').DataFrame;

 // Here we load the titanic data set from the well known Rdatasets (http://vincentarelbundock.github.io/Rdatasets/datasets.html).
 // We get the result via a Promise, as a new DataFrame. We rename it 'df'.
@@ -33,19 +33,30 @@ DataFrame.fromCSV('http://vincentarelbundock.github.io/Rdatasets/csv/COUNT/titan
 // [ 'id', 'Class', 'Sex', 'Age', 'Survived', 'Freq' ]

 // Now our DataFrame is 'clean'. Let's go to a quick analysis.
-console.log(cleanDF.count()); // We have 1316 passengers in the Titanic.
-console.log(cleanDF.filter({survived: 'yes'}).count()); // We have 499 survivors.
-console.log(cleanDF.filter(row => row.get('survived') === 'no').count()); // and 817 died passengers.
+console.log('Total passengers:', cleanDF.count()); // We have 1316 passengers in the Titanic.
+console.log('Survivors:', cleanDF.filter({survived: 'yes'}).count()); // We have 499 survivors.
+console.log('Died:', cleanDF.filter(row => row.get('survived') === 'no').count()); // and 817 passengers died.

-// Ok now we will count the number of passengers by class + age + sex by using groupBy and aggregation.
+// Ok now we will count the number of passengers by class + age + sex + survived by using groupBy and aggregation.
 const countByGroup = cleanDF.groupBy('class', 'age', 'sex', 'survived').aggregate(group => group.count());
-// Ok, now we can see the repartition of passengers by class + age + sex.
+// We can quickly get some statistics on our DataFrame by using the stat module.
+console.log('Some statistics on groups:', countByGroup.stat.stats('aggregation'));
+// { sum: 1316,
+//   mean: 65.8,
+//   min: 1,
+//   max: 387,
+//   var: 7998.063157894738,
+//   varpop: 7598.160000000001,
+//   sd: 89.43189116805446,
+//   sdpop: 87.1674251082364 }
+
+// Ok, now we have the repartition of passengers by class + age + sex + survived.
 // But it could be easier to read if we rename the aggregation and sort rows by count.
-const sortCountByGroup = countByGroup.rename('aggregation', 'count');
+const cleanCountByGroup = countByGroup.rename('aggregation', 'count').sortBy('count', true);

 // And now show the result
-sortCountByGroup.show();
-// | class | age | sex | survived | aggreg... |
+cleanCountByGroup.show(100);
+// | class | age | sex | survived | count |
 // ------------------------------------------------------------
 // | 3rd class | adults | man | no | 387 |
 // | 2nd class | adults | man | no | 154 |
@@ -58,7 +69,16 @@ DataFrame.fromCSV('http://vincentarelbundock.github.io/Rdatasets/csv/COUNT/titan
 // | 1st class | adults | man | yes | 57 |
 // | 3rd class | child | man | no | 35 |

+// OK, if we just look at this table, we can see that rich people (1st class), and more specifically women, have the largest number of survivors.

-
+// To summarize this, it could be interesting to compute the % of survival for each group of passengers.
+// We can do it the following way:
+// First we compute the number of passengers by class + age + sex.
+const countByPassengersType = cleanDF.groupBy('class', 'age', 'sex').aggregate(group => group.count());
+countByPassengersType.show();
+// Then we have to join with the cleanCountByGroup table.
+cleanCountByGroup.innerJoin(countByPassengersType, 'class', 'age', 'sex').show(100);
 }
-);
+).catch(err => {
+    console.log(err);
+});
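
The example stops at showing the joined table. As a side note, the survival rate hinted at in the comments can also be derived directly with groupBy and aggregate, using only calls already present in this file; a sketch, not part of the commit:

    // Share of survivors inside each class + age + sex group.
    const survivalRate = cleanDF
        .groupBy('class', 'age', 'sex')
        .aggregate(group => group.filter({survived: 'yes'}).count() / group.count())
        .rename('aggregation', 'survivalRate');
    survivalRate.sortBy('survivalRate', true).show();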

src/dataframe.js

Lines changed: 19 additions & 17 deletions

@@ -176,9 +176,10 @@ class DataFrame {

     _joinByType(gdf1, gdf2, type) {
         if (type === 'out' || type === 'in') {
-            const gdf2Groups = gdf2.listGroups().map(groupKey => Object.values(groupKey)[0]);
-            return gdf1.toCollection().map(({group, groupKey}) => {
-                const isContained = gdf2Groups.includes(Object.values(groupKey)[0]);
+            const gdf2Hashs = gdf2.listHashs();
+            return gdf1.toCollection().map(({group, hash}) => {
+                const isContained = gdf2Hashs.includes(hash);
+                console.log(gdf2.get(hash).group.listColumns());
                 const filterCondition = (bool) => bool ? group : false;
                 return type === 'out' ? filterCondition(!isContained) : filterCondition(isContained);
             }).filter(group => group);
@@ -188,9 +189,9 @@

     _join(dfToJoin, on, types) {
         const newColumns = [...new Set([...this.listColumns(), ...dfToJoin.listColumns()])];
-        const gdf = this.groupBy(on);
-        const gdfToJoin = dfToJoin.groupBy(on);
-        return [...iter([
+        const gdf = this.groupBy(...on);
+        const gdfToJoin = dfToJoin.groupBy(...on);
+        return [this.__newInstance__([], newColumns), ...iter([
             ...this._joinByType(gdf, gdfToJoin, types[0]),
             ...this._joinByType(gdfToJoin, gdf, types[1]),
         ], group => group.restructure(newColumns))].reduce((p, n) => p.union(n));
@@ -733,6 +734,7 @@
         return this.__newInstance__([...this, ...dfToUnion], this[__columns__]);
     }

+    @checktypes('String', 'DataFrame')
     /**
     * Join two DataFrames.
     * @param {DataFrame} dfToJoin The DataFrame to join.
@@ -742,13 +744,13 @@
     * @example
     * df.join(df2, 'column1', 'full')
     */
-    join(dfToJoin, on, how = 'inner') {
+    join(how, dfToJoin, ...on) {
         const joinMethods = {
-            inner: () => this.innerJoin(dfToJoin, on),
-            full: () => this.fullJoin(dfToJoin, on),
-            outer: () => this.outerJoin(dfToJoin, on),
-            left: () => this.leftJoin(dfToJoin, on),
-            right: () => this.rightJoin(dfToJoin, on),
+            inner: () => this.innerJoin(dfToJoin, ...on),
+            full: () => this.fullJoin(dfToJoin, ...on),
+            outer: () => this.outerJoin(dfToJoin, ...on),
+            left: () => this.leftJoin(dfToJoin, ...on),
+            right: () => this.rightJoin(dfToJoin, ...on),
         };
         return joinMethods[how]();
     }
@@ -763,7 +765,7 @@
     * df.join(df2, 'id')
     * df.join(df2, 'id', 'inner')
     */
-    innerJoin(dfToJoin, on) {
+    innerJoin(dfToJoin, ...on) {
        return this._join(dfToJoin, on, ['in', 'in']);
     }

@@ -776,7 +778,7 @@
     * df.fullJoin(df2, 'id')
     * df.join(df2, 'id', 'full')
     */
-    fullJoin(dfToJoin, on) {
+    fullJoin(dfToJoin, ...on) {
        return this._join(dfToJoin, on, ['', '']);
     }

@@ -789,7 +791,7 @@
     * df2.rightJoin(df2, 'id')
     * df2.join(df2, 'id', 'outer')
     */
-    outerJoin(dfToJoin, on) {
+    outerJoin(dfToJoin, ...on) {
        return this._join(dfToJoin, on, ['out', 'out']);
     }

@@ -802,7 +804,7 @@
     * df.leftJoin(df2, 'id')
     * df.join(df2, 'id', 'left')
     */
-    leftJoin(dfToJoin, on) {
+    leftJoin(dfToJoin, ...on) {
        return this._join(dfToJoin, on, ['', 'in']);
     }

@@ -815,7 +817,7 @@
     * df.rightJoin(df2, 'id')
     * df.join(df2, 'id', 'right')
     */
-    rightJoin(dfToJoin, on) {
+    rightJoin(dfToJoin, ...on) {
        return this._join(dfToJoin, on, ['in', '']);
     }
 }
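
In short, every join variant now spreads its trailing column names into groupBy, and the generic join takes the type first, which is what the added @checktypes('String', 'DataFrame') decorator enforces. A small sketch of equivalent calls under the new signatures (data invented for illustration):

    const { DataFrame } = require('dataframe-js');

    const a = new DataFrame({id: [1, 2], k: ['x', 'y']}, ['id', 'k']);
    const b = new DataFrame({id: [1, 3], k: ['x', 'z']}, ['id', 'k']);

    // These two calls are equivalent: the named variant, and the generic one with the type passed first.
    a.innerJoin(b, 'id', 'k');
    a.join('inner', b, 'id', 'k');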

src/groupedDataframe.js

Lines changed: 20 additions & 1 deletion

@@ -1,6 +1,6 @@
 import { checktypes } from 'es7-checktypes-decorator';

-import { combine } from './reusables.js';
+import { combine, hashCode } from './reusables.js';

 const __groups__ = Symbol('groups');

@@ -30,13 +30,18 @@ export default class GroupedDataFrame {
         }
     }

+    __hashKey__(groupKey) {
+        return hashCode(Object.entries(groupKey).reduce((p, n) => [...p, ...n]).join(''));
+    }
+
     @checktypes('DataFrame', Array)
     _groupBy(df, columnNames) {
         return combine(columnNames.map((column) => df.distinct(column).toArray(column))).map(
             combination => {
                 const groupKey = Object.assign({}, ...combination.map((column, i) => ({[columnNames[i]]: column})));
                 return ({
                     groupKey,
+                    hash: this.__hashKey__(groupKey),
                     group: df.filter(
                         (row) => Object.entries(groupKey).reduce((p, n) => p && Object.is(row.get(n[0]), n[1]), true)
                     ),
@@ -45,6 +50,10 @@ export default class GroupedDataFrame {
         ).filter(({group}) => group.count() > 0);
     }

+    get(hash) {
+        return this.toCollection().find(group => group.hash === hash);
+    }
+
     /**
     * Convert GroupedDataFrame into collection (Array) of dictionnaries (Object).
     * @returns {Array} An Array containing group: {groupKey, group}.
@@ -82,6 +91,16 @@ export default class GroupedDataFrame {
         return [...this].map(({groupKey}) => groupKey);
     }

+    /**
+    * List GroupedDataFrame group hash codes.
+    * @returns {Array} An Array containing GroupedDataFrame hash codes.
+    * @example
+    * gdf.listHashs()
+    */
+    listHashs() {
+        return [...this].map(({hash}) => hash);
+    }
+
     @checktypes('Function')
     /**
     * Create an aggregation from a function.
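
To make the hashing concrete, here is a worked trace of __hashKey__ on an illustrative groupKey (the values are invented):

    // groupKey                            -> {class: '3rd class', sex: 'man'}
    // Object.entries(groupKey)            -> [['class', '3rd class'], ['sex', 'man']]
    // .reduce((p, n) => [...p, ...n])     -> ['class', '3rd class', 'sex', 'man']
    // .join('')                           -> 'class3rd classsexman'
    // hashCode('class3rd classsexman')    -> a single 32-bit integer identifying the group

listHashs() simply collects these integers, and _joinByType in src/dataframe.js now compares them instead of comparing only the first groupKey value, which is what makes joins on multiple columns possible.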

src/reusables.js

Lines changed: 12 additions & 0 deletions

@@ -94,3 +94,15 @@ export function compare(firstElem, secondElem, reverse = false) {
     }
     return 0;
 }
+
+export function hashCode(str) {
+    let hash = 0;
+    let char;
+    if (str.length === 0) return hash;
+    for (let i = 0; i < str.length; i++) {
+        char = str.charCodeAt(i);
+        hash = ((hash << 5) - hash) + char;
+        hash = hash & hash;
+    }
+    return hash;
+}
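
hashCode is the classic 31-multiplier string hash: (hash << 5) - hash equals hash * 31, and hash & hash forces the accumulator back into signed 32-bit range at each step. A small usage sketch (only the empty-string result comes from the code itself; the other values are illustrative):

    import { hashCode } from './reusables.js';

    hashCode('');                                   // 0, via the early return
    const h = hashCode('class3rd classsexman');     // some deterministic 32-bit integer
    hashCode('class3rd classsexman') === h;         // true: equal strings always hash equally
    // Different strings can collide, so the hash identifies groups rather than proving equality.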

src/sqlEngine.js

Lines changed: 2 additions & 1 deletion

@@ -32,8 +32,9 @@ const SELECT_FUNCTIONS = {

 function joinHandler(operation, tables, type) {
     return (df) => df.join(
+        type,
         tables[operation[0]],
-        operation[operation.findIndex(word => word.toUpperCase() === 'ON') + 1], type
+        operation[operation.findIndex(word => word.toUpperCase() === 'ON') + 1]
     );
 }
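
With the join type moved to the first position, joinHandler lines up with the new df.join(how, dfToJoin, ...on) signature shown above. Assuming tmp and tmp2 have been registered with the sql module from the df1 and df2 DataFrames used in the tests (registration is not shown in this diff), the SQL form and the direct call stay equivalent, as the updated sql tests assert:

    // Same result, per tests/sql-test.js further down:
    DataFrame.sql.request('SELECT * FROM tmp JOIN tmp2 ON id').toDict();
    df1.innerJoin(df2, 'id').toDict();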

tests/dataframe-test.js

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -582,11 +582,11 @@ test('DataFrame rows can be ', (assert) => {
582582

583583
const df6 = new DataFrame({
584584
id: [2, 1, 6, 8, 3],
585-
value: [1, 0, 1, 2, 6],
585+
value2: [1, 0, 1, 2, 6],
586586
}, ['id', 'value2']);
587587

588588
assert.deepEqual(
589-
df5.join(df6, 'id', 'inner').sortBy('id').toArray(), [
589+
df5.innerJoin(df6, 'id').sortBy('id').toArray(), [
590590
[1, 0, undefined],
591591
[1, undefined, 0],
592592
[3, 1, undefined],
@@ -597,7 +597,7 @@ test('DataFrame rows can be ', (assert) => {
597597
);
598598

599599
assert.deepEqual(
600-
df5.join(df6, 'id', 'full').sortBy('id').toArray(), [
600+
df5.fullJoin(df6, 'id').sortBy('id').toArray(), [
601601
[1, 0, undefined],
602602
[1, undefined, 0],
603603
[2, undefined, 1],
@@ -609,14 +609,14 @@ test('DataFrame rows can be ', (assert) => {
609609
], 'full joined.'
610610
);
611611
assert.deepEqual(
612-
df5.join(df6, 'id', 'outer').sortBy('id').toArray(), [
612+
df5.outerJoin(df6, 'id').sortBy('id').toArray(), [
613613
[2, undefined, 1],
614614
[6, undefined, 1],
615615
], 'outer joined.'
616616
);
617617

618618
assert.deepEqual(
619-
df5.join(df6, 'id', 'left').sortBy('id').toArray(), [
619+
df5.leftJoin(df6, 'id').sortBy('id').toArray(), [
620620
[1, 0, undefined],
621621
[1, undefined, 0],
622622
[3, 1, undefined],
@@ -627,7 +627,7 @@ test('DataFrame rows can be ', (assert) => {
627627
);
628628

629629
assert.deepEqual(
630-
df5.join(df6, 'id', 'right').sortBy('id').toArray(), [
630+
df5.rightJoin(df6, 'id').sortBy('id').toArray(), [
631631
[1, 0, undefined],
632632
[1, undefined, 0],
633633
[2, undefined, 1],
@@ -639,6 +639,19 @@ test('DataFrame rows can be ', (assert) => {
639639
], 'right joined.'
640640
);
641641

642+
assert.deepEqual(
643+
df6.innerJoin(new DataFrame({
644+
id: [2, 1, 6, 8, 3],
645+
value2: [2, 0, 4, 3, 6],
646+
value3: [1, 0, 1, 2, 6],
647+
}, ['id', 'value2', 'value3']), 'id', 'value2').sortBy('id').toArray(), [
648+
[1, 0, undefined],
649+
[1, 0, 0],
650+
[3, 6, undefined],
651+
[3, 6, 6],
652+
], 'joined on multiple columns.'
653+
);
654+
642655
const df7 = new DataFrame([...Array(20).keys()].map(row => [row]), ['c1']);
643656

644657
assert.isNotDeepEqual(
@@ -684,7 +697,7 @@ test('DataFrame rows can\'t be ', (assert) => {
684697
);
685698

686699
assert.equal(
687-
tryCatch(() => new DataFrame([{c1: 1, c2: 3}]).join([])).name,
700+
tryCatch(() => new DataFrame([{c1: 1, c2: 3}]).innerJoin([])).name,
688701
'TypeError',
689702
'joined with not a DataFrame, throwing TypeError.'
690703
);
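
A reading aid for the new multi-column expectation above: this join implementation unions the matching groups of both sides and restructures them to the union of columns, so every match contributes one row per side, with undefined where that side lacks a column. Traced against the test data:

    // Matching (id, value2) pairs: (1, 0) and (3, 6).
    // Rows coming from df6 (which has no value3):   [1, 0, undefined] and [3, 6, undefined]
    // Rows coming from the second DataFrame:        [1, 0, 0] and [3, 6, 6]
    // Sorted by id, that is exactly the expected array in the assertion.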

tests/groupedDataFrame-test.js

Lines changed: 2 additions & 2 deletions

@@ -33,12 +33,12 @@ test('GroupedDataFrame can be ', (assert) => {
     );

     assert.deepEqual(new GroupedDataFrame(df, 'column1').toCollection().map(x => Object.keys(x)),
-        [['groupKey', 'group'], ['groupKey', 'group'], ['groupKey', 'group']],
+        [['groupKey', 'hash', 'group'], ['groupKey', 'hash', 'group'], ['groupKey', 'hash', 'group']],
         'converted into a collection of dictionnaries containing each group.'
     );

     assert.deepEqual([...new GroupedDataFrame(df, 'column1')].map(x => Object.keys(x)),
-        [['groupKey', 'group'], ['groupKey', 'group'], ['groupKey', 'group']],
+        [['groupKey', 'hash', 'group'], ['groupKey', 'hash', 'group'], ['groupKey', 'hash', 'group']],
         'converted into a collection of dictionnaries containing each group when destructured.'
     );

tests/sql-test.js

Lines changed: 2 additions & 2 deletions

@@ -79,13 +79,13 @@ test('DataFrame sql module can ', (assert) => {

     assert.deepEqual(
         DataFrame.sql.request('SELECT * FROM tmp JOIN tmp2 ON id').toDict(),
-        df1.join(df2, 'id').toDict(),
+        df1.innerJoin(df2, 'id').toDict(),
         'select everything from a join (inner) between 2 tables.'
     );

     assert.deepEqual(
         DataFrame.sql.request('SELECT * FROM tmp JOIN tmp2 ON id WHERE column1 != undefined').toDict(),
-        df1.join(df2, 'id').filter(row => row.get('column1') !== undefined).toDict(),
+        df1.innerJoin(df2, 'id').filter(row => row.get('column1') !== undefined).toDict(),
         'select everything from a join chained with a filter.'
     );
