
Commit 5758f80: debug of the joins
Parent: 4836357

File tree: 9 files changed (+109, -42 lines)


CHANGELOG.md

Lines changed: 1 addition & 1 deletion

@@ -27,7 +27,7 @@
 - Debugging DataFrame.sortBy() when used with strings.
 - DataFrame.distinct() now correctly throws NoSuchColumnError when passing an incorrect columnName.
 - Adding Error messages on Row, DataFrame, GroupedDataFrame and modules...
-- Refactoring join methods.
+- Join methods are completely revisited, providing results closer to SQL. Moreover, you can join on multiple columns.
 - Adding unit tests to cover more cases.
 - Unit tests are now run on the es5 compiled version.
 - Clarifying error messages.
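
As context for the changelog entry above, the revisited join methods take the join columns as trailing arguments, so joining on several columns is just a matter of listing them. A minimal sketch with invented sample data, assuming the package is consumed as dataframe-js (the example file below imports the local build instead):

    const { DataFrame } = require('dataframe-js');

    const orders = new DataFrame({id: [1, 2, 3], region: ['eu', 'eu', 'us'], total: [10, 20, 30]}, ['id', 'region', 'total']);
    const refunds = new DataFrame({id: [1, 3, 4], region: ['eu', 'us', 'us'], refunded: [5, 30, 7]}, ['id', 'region', 'refunded']);

    // Join on both 'id' and 'region'; columns present on only one side show up as undefined in the other side's rows.
    orders.innerJoin(refunds, 'id', 'region').show();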

examples/titanic_analysis.js

Lines changed: 31 additions & 11 deletions

@@ -3,7 +3,7 @@

 // Here we import the lib.
 // You can also: import { DataFrame } from 'dataframe-js';
-const DataFrame = require('dataframe-js').DataFrame;
+const DataFrame = require('../src/index.js').DataFrame;

 // Here we load the titanic data set from the well known Rdatasets (http://vincentarelbundock.github.io/Rdatasets/datasets.html).
 // We get the result via a Promise, as a new DataFrame. We rename it 'df'.
@@ -33,19 +33,30 @@ DataFrame.fromCSV('http://vincentarelbundock.github.io/Rdatasets/csv/COUNT/titan
 // [ 'id', 'Class', 'Sex', 'Age', 'Survived', 'Freq' ]

 // Now our DataFrame is 'clean'. Let's go to a quick analysis.
-console.log(cleanDF.count()); // We have 1316 passengers in the Titanic.
-console.log(cleanDF.filter({survived: 'yes'}).count()); // We have 499 survivors.
-console.log(cleanDF.filter(row => row.get('survived') === 'no').count()); // and 817 died passengers.
+console.log('Total passengers:', cleanDF.count()); // We have 1316 passengers in the Titanic.
+console.log('Survivors:', cleanDF.filter({survived: 'yes'}).count()); // We have 499 survivors.
+console.log('Died:', cleanDF.filter(row => row.get('survived') === 'no').count()); // and 817 passengers died.

-// Ok now we will count the number of passengers by class + age + sex by using groupBy and aggregation.
+// Ok now we will count the number of passengers by class + age + sex + survived by using groupBy and aggregation.
 const countByGroup = cleanDF.groupBy('class', 'age', 'sex', 'survived').aggregate(group => group.count());
-// Ok, now we can see the repartition of passengers by class + age + sex.
+// We can quickly get some statistics on our DataFrame by using the stat module.
+console.log('Some statistics on groups:', countByGroup.stat.stats('aggregation'));
+// { sum: 1316,
+//   mean: 65.8,
+//   min: 1,
+//   max: 387,
+//   var: 7998.063157894738,
+//   varpop: 7598.160000000001,
+//   sd: 89.43189116805446,
+//   sdpop: 87.1674251082364 }
+
+// Ok, now we have the repartition of passengers by class + age + sex + survived.
 // But it could be easier to read if we rename the aggregation and sort rows by count.
-const sortCountByGroup = countByGroup.rename('aggregation', 'count');
+const cleanCountByGroup = countByGroup.rename('aggregation', 'count').sortBy('count', true);

 // And now show the result
-sortCountByGroup.show();
-// | class | age | sex | survived | aggreg... |
+cleanCountByGroup.show(100);
+// | class | age | sex | survived | count |
 // ------------------------------------------------------------
 // | 3rd class | adults | man | no | 387 |
 // | 2nd class | adults | man | no | 154 |
@@ -58,7 +69,16 @@ DataFrame.fromCSV('http://vincentarelbundock.github.io/Rdatasets/csv/COUNT/titan
 // | 1st class | adults | man | yes | 57 |
 // | 3rd class | child | man | no | 35 |

+// OK, if we just look at this table, we can see that rich people (1st class), and more specifically women, have the largest number of survivors.

-
+// To summarize this, it could be interesting to compute the % of survival for each group of passengers.
+// We can do it the following way:
+// First we compute the number of passengers by class + age + sex.
+const countByPassengersType = cleanDF.groupBy('class', 'age', 'sex').aggregate(group => group.count());
+countByPassengersType.show();
+// Then we have to join with the cleanCountByGroup table.
+cleanCountByGroup.innerJoin(countByPassengersType, 'class', 'age', 'sex').show(100);
 }
-);
+).catch(err => {
+    console.log(err);
+});
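
The example stops at showing the joined table. As a side note, the survival rate hinted at in the comments can also be derived directly with groupBy and aggregate, using only calls already present in this file; a sketch, not part of the commit:

    // Share of survivors inside each class + age + sex group.
    const survivalRate = cleanDF
        .groupBy('class', 'age', 'sex')
        .aggregate(group => group.filter({survived: 'yes'}).count() / group.count())
        .rename('aggregation', 'survivalRate');
    survivalRate.sortBy('survivalRate', true).show();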

src/dataframe.js

Lines changed: 19 additions & 17 deletions

@@ -176,9 +176,10 @@ class DataFrame {

     _joinByType(gdf1, gdf2, type) {
         if (type === 'out' || type === 'in') {
-            const gdf2Groups = gdf2.listGroups().map(groupKey => Object.values(groupKey)[0]);
-            return gdf1.toCollection().map(({group, groupKey}) => {
-                const isContained = gdf2Groups.includes(Object.values(groupKey)[0]);
+            const gdf2Hashs = gdf2.listHashs();
+            return gdf1.toCollection().map(({group, hash}) => {
+                const isContained = gdf2Hashs.includes(hash);
+                console.log(gdf2.get(hash).group.listColumns());
                 const filterCondition = (bool) => bool ? group : false;
                 return type === 'out' ? filterCondition(!isContained) : filterCondition(isContained);
             }).filter(group => group);
@@ -188,9 +189,9 @@

     _join(dfToJoin, on, types) {
         const newColumns = [...new Set([...this.listColumns(), ...dfToJoin.listColumns()])];
-        const gdf = this.groupBy(on);
-        const gdfToJoin = dfToJoin.groupBy(on);
-        return [...iter([
+        const gdf = this.groupBy(...on);
+        const gdfToJoin = dfToJoin.groupBy(...on);
+        return [this.__newInstance__([], newColumns), ...iter([
             ...this._joinByType(gdf, gdfToJoin, types[0]),
             ...this._joinByType(gdfToJoin, gdf, types[1]),
         ], group => group.restructure(newColumns))].reduce((p, n) => p.union(n));
@@ -733,6 +734,7 @@
         return this.__newInstance__([...this, ...dfToUnion], this[__columns__]);
     }

+    @checktypes('String', 'DataFrame')
     /**
     * Join two DataFrames.
     * @param {DataFrame} dfToJoin The DataFrame to join.
@@ -742,13 +744,13 @@
     * @example
     * df.join(df2, 'column1', 'full')
     */
-    join(dfToJoin, on, how = 'inner') {
+    join(how, dfToJoin, ...on) {
         const joinMethods = {
-            inner: () => this.innerJoin(dfToJoin, on),
-            full: () => this.fullJoin(dfToJoin, on),
-            outer: () => this.outerJoin(dfToJoin, on),
-            left: () => this.leftJoin(dfToJoin, on),
-            right: () => this.rightJoin(dfToJoin, on),
+            inner: () => this.innerJoin(dfToJoin, ...on),
+            full: () => this.fullJoin(dfToJoin, ...on),
+            outer: () => this.outerJoin(dfToJoin, ...on),
+            left: () => this.leftJoin(dfToJoin, ...on),
+            right: () => this.rightJoin(dfToJoin, ...on),
         };
         return joinMethods[how]();
     }
@@ -763,7 +765,7 @@
     * df.join(df2, 'id')
     * df.join(df2, 'id', 'inner')
     */
-    innerJoin(dfToJoin, on) {
+    innerJoin(dfToJoin, ...on) {
        return this._join(dfToJoin, on, ['in', 'in']);
     }

@@ -776,7 +778,7 @@
     * df.fullJoin(df2, 'id')
     * df.join(df2, 'id', 'full')
     */
-    fullJoin(dfToJoin, on) {
+    fullJoin(dfToJoin, ...on) {
        return this._join(dfToJoin, on, ['', '']);
     }

@@ -789,7 +791,7 @@
     * df2.rightJoin(df2, 'id')
     * df2.join(df2, 'id', 'outer')
     */
-    outerJoin(dfToJoin, on) {
+    outerJoin(dfToJoin, ...on) {
        return this._join(dfToJoin, on, ['out', 'out']);
     }

@@ -802,7 +804,7 @@
     * df.leftJoin(df2, 'id')
     * df.join(df2, 'id', 'left')
     */
-    leftJoin(dfToJoin, on) {
+    leftJoin(dfToJoin, ...on) {
        return this._join(dfToJoin, on, ['', 'in']);
     }

@@ -815,7 +817,7 @@
     * df.rightJoin(df2, 'id')
     * df.join(df2, 'id', 'right')
     */
-    rightJoin(dfToJoin, on) {
+    rightJoin(dfToJoin, ...on) {
        return this._join(dfToJoin, on, ['in', '']);
     }
 }
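
In short, every join variant now spreads its trailing column names into groupBy, and the generic join takes the type first, which is what the added @checktypes('String', 'DataFrame') decorator enforces. A small sketch of equivalent calls under the new signatures (data invented for illustration):

    const { DataFrame } = require('dataframe-js');

    const a = new DataFrame({id: [1, 2], k: ['x', 'y']}, ['id', 'k']);
    const b = new DataFrame({id: [1, 3], k: ['x', 'z']}, ['id', 'k']);

    // These two calls are equivalent: the named variant, and the generic one with the type passed first.
    a.innerJoin(b, 'id', 'k');
    a.join('inner', b, 'id', 'k');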

src/groupedDataframe.js

Lines changed: 20 additions & 1 deletion

@@ -1,6 +1,6 @@
 import { checktypes } from 'es7-checktypes-decorator';

-import { combine } from './reusables.js';
+import { combine, hashCode } from './reusables.js';

 const __groups__ = Symbol('groups');

@@ -30,13 +30,18 @@ export default class GroupedDataFrame {
         }
     }

+    __hashKey__(groupKey) {
+        return hashCode(Object.entries(groupKey).reduce((p, n) => [...p, ...n]).join(''));
+    }
+
     @checktypes('DataFrame', Array)
     _groupBy(df, columnNames) {
         return combine(columnNames.map((column) => df.distinct(column).toArray(column))).map(
             combination => {
                 const groupKey = Object.assign({}, ...combination.map((column, i) => ({[columnNames[i]]: column})));
                 return ({
                     groupKey,
+                    hash: this.__hashKey__(groupKey),
                     group: df.filter(
                         (row) => Object.entries(groupKey).reduce((p, n) => p && Object.is(row.get(n[0]), n[1]), true)
                     ),
@@ -45,6 +50,10 @@ export default class GroupedDataFrame {
         ).filter(({group}) => group.count() > 0);
     }

+    get(hash) {
+        return this.toCollection().find(group => group.hash === hash);
+    }
+
     /**
     * Convert GroupedDataFrame into collection (Array) of dictionnaries (Object).
     * @returns {Array} An Array containing group: {groupKey, group}.
@@ -82,6 +91,16 @@ export default class GroupedDataFrame {
         return [...this].map(({groupKey}) => groupKey);
     }

+    /**
+    * List GroupedDataFrame group hash codes.
+    * @returns {Array} An Array containing GroupedDataFrame hash codes.
+    * @example
+    * gdf.listHashs()
+    */
+    listHashs() {
+        return [...this].map(({hash}) => hash);
+    }
+
     @checktypes('Function')
     /**
     * Create an aggregation from a function.
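
To make the hashing concrete, here is a worked trace of __hashKey__ on an illustrative groupKey (the values are invented):

    // groupKey                            -> {class: '3rd class', sex: 'man'}
    // Object.entries(groupKey)            -> [['class', '3rd class'], ['sex', 'man']]
    // .reduce((p, n) => [...p, ...n])     -> ['class', '3rd class', 'sex', 'man']
    // .join('')                           -> 'class3rd classsexman'
    // hashCode('class3rd classsexman')    -> a single 32-bit integer identifying the group

listHashs() simply collects these integers, and _joinByType in src/dataframe.js now compares them instead of comparing only the first groupKey value, which is what makes joins on multiple columns possible.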

src/reusables.js

Lines changed: 12 additions & 0 deletions

@@ -94,3 +94,15 @@ export function compare(firstElem, secondElem, reverse = false) {
     }
     return 0;
 }
+
+export function hashCode(str) {
+    let hash = 0;
+    let char;
+    if (str.length === 0) return hash;
+    for (let i = 0; i < str.length; i++) {
+        char = str.charCodeAt(i);
+        hash = ((hash << 5) - hash) + char;
+        hash = hash & hash;
+    }
+    return hash;
+}
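
hashCode is the classic 31-multiplier string hash: (hash << 5) - hash equals hash * 31, and hash & hash forces the accumulator back into signed 32-bit range at each step. A small usage sketch (only the empty-string result comes from the code itself; the other values are illustrative):

    import { hashCode } from './reusables.js';

    hashCode('');                                   // 0, via the early return
    const h = hashCode('class3rd classsexman');     // some deterministic 32-bit integer
    hashCode('class3rd classsexman') === h;         // true: equal strings always hash equally
    // Different strings can collide, so the hash identifies groups rather than proving equality.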

src/sqlEngine.js

Lines changed: 2 additions & 1 deletion

@@ -32,8 +32,9 @@ const SELECT_FUNCTIONS = {

 function joinHandler(operation, tables, type) {
     return (df) => df.join(
+        type,
         tables[operation[0]],
-        operation[operation.findIndex(word => word.toUpperCase() === 'ON') + 1], type
+        operation[operation.findIndex(word => word.toUpperCase() === 'ON') + 1]
     );
 }
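
With the join type moved to the first position, joinHandler lines up with the new df.join(how, dfToJoin, ...on) signature shown above. Assuming tmp and tmp2 have been registered with the sql module from the df1 and df2 DataFrames used in the tests (registration is not shown in this diff), the SQL form and the direct call stay equivalent, as the updated sql tests assert:

    // Same result, per tests/sql-test.js further down:
    DataFrame.sql.request('SELECT * FROM tmp JOIN tmp2 ON id').toDict();
    df1.innerJoin(df2, 'id').toDict();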

tests/dataframe-test.js

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -582,11 +582,11 @@ test('DataFrame rows can be ', (assert) => {
582582

583583
const df6 = new DataFrame({
584584
id: [2, 1, 6, 8, 3],
585-
value: [1, 0, 1, 2, 6],
585+
value2: [1, 0, 1, 2, 6],
586586
}, ['id', 'value2']);
587587

588588
assert.deepEqual(
589-
df5.join(df6, 'id', 'inner').sortBy('id').toArray(), [
589+
df5.innerJoin(df6, 'id').sortBy('id').toArray(), [
590590
[1, 0, undefined],
591591
[1, undefined, 0],
592592
[3, 1, undefined],
@@ -597,7 +597,7 @@ test('DataFrame rows can be ', (assert) => {
597597
);
598598

599599
assert.deepEqual(
600-
df5.join(df6, 'id', 'full').sortBy('id').toArray(), [
600+
df5.fullJoin(df6, 'id').sortBy('id').toArray(), [
601601
[1, 0, undefined],
602602
[1, undefined, 0],
603603
[2, undefined, 1],
@@ -609,14 +609,14 @@ test('DataFrame rows can be ', (assert) => {
609609
], 'full joined.'
610610
);
611611
assert.deepEqual(
612-
df5.join(df6, 'id', 'outer').sortBy('id').toArray(), [
612+
df5.outerJoin(df6, 'id').sortBy('id').toArray(), [
613613
[2, undefined, 1],
614614
[6, undefined, 1],
615615
], 'outer joined.'
616616
);
617617

618618
assert.deepEqual(
619-
df5.join(df6, 'id', 'left').sortBy('id').toArray(), [
619+
df5.leftJoin(df6, 'id').sortBy('id').toArray(), [
620620
[1, 0, undefined],
621621
[1, undefined, 0],
622622
[3, 1, undefined],
@@ -627,7 +627,7 @@ test('DataFrame rows can be ', (assert) => {
627627
);
628628

629629
assert.deepEqual(
630-
df5.join(df6, 'id', 'right').sortBy('id').toArray(), [
630+
df5.rightJoin(df6, 'id').sortBy('id').toArray(), [
631631
[1, 0, undefined],
632632
[1, undefined, 0],
633633
[2, undefined, 1],
@@ -639,6 +639,19 @@ test('DataFrame rows can be ', (assert) => {
639639
], 'right joined.'
640640
);
641641

642+
assert.deepEqual(
643+
df6.innerJoin(new DataFrame({
644+
id: [2, 1, 6, 8, 3],
645+
value2: [2, 0, 4, 3, 6],
646+
value3: [1, 0, 1, 2, 6],
647+
}, ['id', 'value2', 'value3']), 'id', 'value2').sortBy('id').toArray(), [
648+
[1, 0, undefined],
649+
[1, 0, 0],
650+
[3, 6, undefined],
651+
[3, 6, 6],
652+
], 'joined on multiple columns.'
653+
);
654+
642655
const df7 = new DataFrame([...Array(20).keys()].map(row => [row]), ['c1']);
643656

644657
assert.isNotDeepEqual(
@@ -684,7 +697,7 @@ test('DataFrame rows can\'t be ', (assert) => {
684697
);
685698

686699
assert.equal(
687-
tryCatch(() => new DataFrame([{c1: 1, c2: 3}]).join([])).name,
700+
tryCatch(() => new DataFrame([{c1: 1, c2: 3}]).innerJoin([])).name,
688701
'TypeError',
689702
'joined with not a DataFrame, throwing TypeError.'
690703
);
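
A reading aid for the new multi-column expectation above: this join implementation unions the matching groups of both sides and restructures them to the union of columns, so every match contributes one row per side, with undefined where that side lacks a column. Traced against the test data:

    // Matching (id, value2) pairs: (1, 0) and (3, 6).
    // Rows coming from df6 (which has no value3):   [1, 0, undefined] and [3, 6, undefined]
    // Rows coming from the second DataFrame:        [1, 0, 0] and [3, 6, 6]
    // Sorted by id, that is exactly the expected array in the assertion.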

tests/groupedDataFrame-test.js

Lines changed: 2 additions & 2 deletions

@@ -33,12 +33,12 @@ test('GroupedDataFrame can be ', (assert) => {
     );

     assert.deepEqual(new GroupedDataFrame(df, 'column1').toCollection().map(x => Object.keys(x)),
-        [['groupKey', 'group'], ['groupKey', 'group'], ['groupKey', 'group']],
+        [['groupKey', 'hash', 'group'], ['groupKey', 'hash', 'group'], ['groupKey', 'hash', 'group']],
         'converted into a collection of dictionnaries containing each group.'
     );

     assert.deepEqual([...new GroupedDataFrame(df, 'column1')].map(x => Object.keys(x)),
-        [['groupKey', 'group'], ['groupKey', 'group'], ['groupKey', 'group']],
+        [['groupKey', 'hash', 'group'], ['groupKey', 'hash', 'group'], ['groupKey', 'hash', 'group']],
         'converted into a collection of dictionnaries containing each group when destructured.'
     );

tests/sql-test.js

Lines changed: 2 additions & 2 deletions

@@ -79,13 +79,13 @@ test('DataFrame sql module can ', (assert) => {

     assert.deepEqual(
         DataFrame.sql.request('SELECT * FROM tmp JOIN tmp2 ON id').toDict(),
-        df1.join(df2, 'id').toDict(),
+        df1.innerJoin(df2, 'id').toDict(),
         'select everything from a join (inner) between 2 tables.'
     );

     assert.deepEqual(
         DataFrame.sql.request('SELECT * FROM tmp JOIN tmp2 ON id WHERE column1 != undefined').toDict(),
-        df1.join(df2, 'id').filter(row => row.get('column1') !== undefined).toDict(),
+        df1.innerJoin(df2, 'id').filter(row => row.get('column1') !== undefined).toDict(),
         'select everything from a join chained with a filter.'
     );
