diff --git a/machine-learning-box/gender_age_prediction/.ruby-version b/machine-learning-box/gender_age_prediction/.ruby-version
new file mode 100644
index 00000000..ec1cf33c
--- /dev/null
+++ b/machine-learning-box/gender_age_prediction/.ruby-version
@@ -0,0 +1 @@
+2.6.3
diff --git a/machine-learning-box/gender_age_prediction/blogposts.dig b/machine-learning-box/gender_age_prediction/blogposts.dig
new file mode 100644
index 00000000..081d1796
--- /dev/null
+++ b/machine-learning-box/gender_age_prediction/blogposts.dig
@@ -0,0 +1,21 @@
+_export:  # settings shared by every task in this workflow
+  !include : config/params.yml
+  td:
+    database: ${target_db}  # target_db is defined in config/params.yml
+    engine: hive  # default engine for the td> tasks below
+    priority: ${job_priority}  # job_priority is defined in config/params.yml
+
++tokenize:  # step 1: one output row per (userid, word) taken from the blog posts
+  td>: queries/blogposts/tokenize_en.sql
+  create_table: exploded  # ftvec.sql reads this table as `exploded`
+
++vectorize:  # step 2: per-user feature list plus gender_age label
+  td>: queries/blogposts/ftvec.sql
+  create_table: input  # presumably read by queries/rf/rf_input.sql (not shown here), TODO confirm
+  engine_version: experimental  # NOTE(review): confirm this query still needs the experimental engine
+
++predict:  # step 3: hand over to the RandomForest sub-workflow
+  _parallel: true  # children may run concurrently; only one child is defined at present
+
+  +rf_predict:
+    call>: blogposts_rf_predict.dig  # preparation, cross validation, training and prediction
diff --git a/machine-learning-box/gender_age_prediction/blogposts.md b/machine-learning-box/gender_age_prediction/blogposts.md
new file mode 100644
index 00000000..37365951
--- /dev/null
+++ b/machine-learning-box/gender_age_prediction/blogposts.md
@@ -0,0 +1,46 @@
+Please download dataset from [Kaggle](https://www.kaggle.com/tomlisankie/blog-posts-labeled-with-age-and-gender/download) first.
+
+Then, you need a Kaggle account for download. Please set your kaggle API credentials in ~/.kaggle/kaggle.json following [this instruction](https://github.com/Kaggle/kaggle-api#api-credentials).
+
+## Prepare data
+
+Please download dataset from kaggle and run the following data preprocessing.
+
+
+```sh
+pip install kaggle
+
+chmod 600 ~/.kaggle/kaggle.json
+kaggle datasets download tomlisankie/blog-posts-labeled-with-age-and-gender
+
+unzip blog-posts-labeled-with-age-and-gender.zip
+
+brew install jq
+
+echo -e "userid\tpost\tage\tgender" > blogposts.tsv
+jq -r -c '.[] | [.post,.age,.gender] | @tsv' train.json | awk '{print NR"\t"$0}' >> blogposts.tsv
+jq -r -c '.[] | [.post,.age,.gender] | @tsv' test.json | awk '{print 526812+NR"\t"$0}' >> blogposts.tsv
+```
+
+## Import data to Treasure Data
+
+Please import the prepared blog post data to Treasure Data as follows:
+
+```sh
+# create database
+td db:create td_test
+
+# load training data
+td table:create td_test blogposts
+td import:auto --auto-create td_test.blogposts --format tsv --column-header --time-value `date +%s` --column-types "int,string,int,string" ./blogposts.tsv
+```
+
+## Run gender-age prediction workflow
+
+```sh
+# Push workflows to Treasure Workflow
+td wf push td_test
+
+# Run workflow from command line (also runnable from GUI)
+td wf run blogposts.dig
+```
\ No newline at end of file
diff --git a/machine-learning-box/gender_age_prediction/blogposts_rf_predict.dig b/machine-learning-box/gender_age_prediction/blogposts_rf_predict.dig
new file mode 100644
index 00000000..e9b62210
--- /dev/null
+++ b/machine-learning-box/gender_age_prediction/blogposts_rf_predict.dig
@@ -0,0 +1,112 @@
+_export:
+  !include : config/params.yml
+  td:
+    database: ${target_db}
+    engine: hive
+#    engine_version: experimental
+    priority: ${job_priority}
+
++preparation:
+
+  +label_mapping:
+    td>: queries/rf/label_mapping.sql
+    create_table: label_mapping
+
+  +rf_input:
+    td>: queries/rf/rf_input.sql
+    create_table: rf_input
+
+  +compute_class_weight:
+    td>: queries/rf/compute_class_weight.sql
+    create_table: class_weight
+    engine: presto
+
+  +prepare_model_tables:
+    td_ddl>:
+    empty_tables: ["rf_model_cv", "rf_model"]
+
++store_weights:
+  td>: queries/rf/store_weights.sql
+  store_last_results: true
+
++validate_and_train:
+  _export:
+    class_weights: ${td.last_results.weights}  # `weights` value kept by +store_weights (store_last_results: true)
+
+  +cross_validation:  # train into rf_model_cv, predict, then evaluate the predictions
+
+    +parallel_train:
+      for_range>:
+        from: 0
+        to: ${rf_num_train_parallel}  # rf_num_train_parallel is defined in config/params.yml
+        step: 1
+      _parallel: true  # run the training iterations concurrently
+      _do:
+        +train:
+          td>: queries/rf/train_cv.sql
+          insert_into: rf_model_cv  # table is listed in empty_tables of +prepare_model_tables
+          seed: ${rf_seed + range.index * 100}  # a different seed for every iteration
+
+    +predict:
+      td>: queries/rf/predict_cv.sql
+      create_table: rf_predicted_cv  # read by queries/blogposts/rf/topk_predict.sql
+
+#    +topk_predict:
+#      td>: queries/blogposts/rf/topk_predict.sql
+#      create_table: rf_topk_predict
+#
+#    +evaluation_measures:
+#      td>: queries/rf/eval.sql
+
+    +eval:
+      _parallel: true  # +confusion_matrix and +actual_predict_diff may run concurrently
+
+      +confusion_matrix:
+
+        +confusion_matrix_table:
+          td>: queries/rf/confusion_matrix.sql
+          engine: presto
+          create_table: rf_confusion_matrix  # read by the pivot query in the next task
+
+        +confusion_matrix_pivot:
+          td>: queries/blogposts/rf/confusion_matrix_pivot.sql
+          engine: presto  # no create_table: the pivot is only returned as the job result
+
+      +actual_predict_diff:
+        td>: queries/rf/actual_predict_diff.sql
+        engine: presto
+
+#    +heuristic_calibration:
+#
+#      +calibration_prediction:
+#        td>: queries/rf/calibrate_prediction.sql
+#        engine: presto
+#        create_table: rf_predicted_cv_calibrated
+#
+#      +eval_calibration:
+#        td>: queries/rf/confusion_matrix_calibrated.sql
+#        engine: presto
+#        create_table: rf_confusion_matrix_calibrated
+
+  +train_predict:  # train into rf_model, then predict for the rows of rf_input
+
+    +parallel_train:
+      for_range>:
+        from: 0
+        to: ${rf_num_train_parallel}  # same iteration count as the cross-validation training
+        step: 1
+      _parallel: true
+      _do:
+        +train:
+          td>: queries/rf/train.sql
+          insert_into: rf_model  # table is listed in empty_tables of +prepare_model_tables
+          seed: ${rf_seed + range.index * 100}  # a different seed for every iteration
+
+    +prediction:
+      td>: queries/blogposts/rf/predict.sql
+      create_table: rf_predicted
+
+    +complement_prediction:
+      td>: queries/rf/complement_prediction.sql
+      engine: presto
+      create_table: rf_complemented
diff --git a/machine-learning-box/gender_age_prediction/config/params.yml b/machine-learning-box/gender_age_prediction/config/params.yml
index 6de49538..ad8152a4 100644
--- a/machine-learning-box/gender_age_prediction/config/params.yml
+++ b/machine-learning-box/gender_age_prediction/config/params.yml
@@ -1,6 +1,7 @@
 target_db: td_test
 
-job_priority: -1
+job_priority: 0
+# job_priority: -1
 
 topk_predict: 5
 
@@ -21,15 +22,15 @@ max_age: 95
 
 # RandomForest
 rf_seed: 71
-rf_trees: 15
-rf_num_train_parallel: 4
-rf_max_depth: 30
+rf_trees: 10
+rf_num_train_parallel: 5
+rf_max_depth: 10
 min_split: 3
 min_samples_leaf: 1
 
 # over/down sampling
 min_class_weight: 0.8
-max_class_weight: 4.0
+max_class_weight: 1.2
 
 # heuristic calibration
 f15_factor: 1.0
@@ -43,6 +44,13 @@ m25_factor: 1.0
 m35_factor: 1.0
 m50_factor: 1.0
 
+f1x_factor: 1.0
+f2x_factor: 1.0
+f3x_factor: 1.0
+m1x_factor: 1.0
+m2x_factor: 1.0
+m3x_factor: 1.0
+
 #f15_factor: 1.1
 #f20_factor: 1.4
 #f25_factor: 0.75
diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/ftvec.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/ftvec.sql
new file mode 100644
index 00000000..cec397d4
--- /dev/null
+++ b/machine-learning-box/gender_age_prediction/queries/blogposts/ftvec.sql
@@ -0,0 +1,85 @@
+-- @TD distribute_strategy: aggressive
+WITH term_frequency as ( -- one row per (userid, word) with the frequency reported by tf()
+  select
+    t1.userid,
+    t2.word,
+    t2.freq
+  from (
+    select
+      userid,
+      tf(word) as word2freq -- word-to-frequency map, unpacked by the LATERAL VIEW below
+    from
+      exploded
+    group by
+      userid
+  ) t1
+  LATERAL VIEW explode(word2freq) t2 as word, freq
+),
+document_frequency AS ( -- per word: number of distinct users whose rows contain it
+  select
+    word,
+    count(distinct userid) docs
+  from
+    exploded
+  group by
+    word
+),
+doc_len as ( -- per user: row count plus statistics taken over all users
+  select
+    userid,
+    count(1) as dl, -- number of rows in `exploded` for this user
+    avg(count(1)) over () as avgdl, -- average of dl across all users
+    APPROX_COUNT_DISTINCT(userid) over () as total_docs -- approximate number of users
+  from
+    exploded
+  group by
+    userid
+),
+scores as ( -- bm25 score of every (userid, word) pair
+  select
+    tf.userid,
+    tf.word,
+    bm25(tf.freq, dl.dl, dl.avgdl, dl.total_docs, df.docs) as bm25
+    -- tfidf(tf.freq, df.docs, dl.total_docs) as tfidf
+  from
+    term_frequency tf
+    JOIN document_frequency df ON (tf.word = df.word)
+    JOIN doc_len dl ON (tf.userid = dl.userid)
+  where
+    df.docs >= 2 -- skip words that occur for a single user only
+),
+ftvec as ( -- one feature list per user
+  select
+    userid,
+    to_ordered_list(feature(word,bm25), bm25, '-k 100') as features -- presumably keeps the 100 highest-bm25 words, TODO confirm the -k semantics
+  from
+    scores
+  group by
+    userid
+),
+ages as ( -- gender_age label per row of blogposts
+  select
+    userid,
+    concat(CASE WHEN gender = 'male' THEN 'M' WHEN gender IS NOT NULL THEN 'F' END, -- a missing gender yields NULL instead of being counted as F
+      CASE
+        WHEN age >= 33 THEN '3x'
+        WHEN age >= 23 THEN '2x'
+        WHEN age IS NOT NULL THEN '1x' -- a missing age yields NULL instead of falling into 1x
+      END
+    ) as gender_age -- M or F followed by the age band, NULL when either input is missing (concat of a NULL is NULL)
+  from
+    blogposts
+)
+-- DIGDAG_INSERT_LINE
+SELECT
+  l.userid,
+  l.features,
+  r.gender_age,
+  -- random sampling
+  rand(42) as rnd,
+  -- stratified sampling
+  count(1) over (partition by r.gender_age) as per_label_count, -- number of rows sharing this label
+  rank() over (partition by r.gender_age order by rand(41)) as rank_in_label -- random position of the row inside its label
+FROM
+  ftvec l
+  JOIN ages r ON (l.userid = r.userid)
\ No newline at end of file
diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/rf/confusion_matrix_pivot.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/confusion_matrix_pivot.sql
new file mode 100644
index 00000000..ed1fa392
--- /dev/null
+++ b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/confusion_matrix_pivot.sql
@@ -0,0 +1,21 @@
+-- DIGDAG_INSERT_LINE
+WITH tmp as ( -- one row per actual label, one column per predicted label
+  SELECT
+    actual,
+    max(CASE WHEN predicted = 'F1x' THEN cnt ELSE 0 END) AS F1x, -- 0 when rf_confusion_matrix has no row for this pair
+    max(CASE WHEN predicted = 'F2x' THEN cnt ELSE 0 END) AS F2x,
+    max(CASE WHEN predicted = 'F3x' THEN cnt ELSE 0 END) AS F3x,
+    max(CASE WHEN predicted = 'M1x' THEN cnt ELSE 0 END) AS M1x,
+    max(CASE WHEN predicted = 'M2x' THEN cnt ELSE 0 END) AS M2x,
+    max(CASE WHEN predicted = 'M3x' THEN cnt ELSE 0 END) AS M3x
+  FROM rf_confusion_matrix
+  GROUP BY actual
+)
+select
+  actual,
+  F1x,F2x,F3x,
+  M1x,M2x,M3x
+from
+  tmp
+order by
+  actual asc
diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/rf/predict.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/predict.sql
new file mode 100644
index 00000000..d743a0cc
--- /dev/null
+++ b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/predict.sql
@@ -0,0 +1,36 @@
+-- @TD enable_cartesian_product: true
+-- @TD autoconvertjoin: true
+WITH t2 as ( -- one combined prediction per user, built from the per-model predictions of the inner query
+  SELECT
+    userid,
+    rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
+  FROM (
+    SELECT
+      t.userid,
+      p.model_weight,
+      tree_predict(p.model_id, p.model, t.features, '-classification') as predicted
+    FROM
+      rf_model p
+      LEFT OUTER JOIN rf_input t -- no ON clause: each model row is paired with each input row (see enable_cartesian_product at the top)
+  ) t1
+  GROUP BY
+    userid
+)
+-- DIGDAG_INSERT_LINE
+SELECT
+  l.userid,
+  r.label,
+  l.predicted.probability,
+  l.predicted.probabilities as raw_probability,
+  array( -- calibration
+    l.predicted.probabilities[0] * ${f1x_factor}, -- F1x
+    l.predicted.probabilities[1] * ${f2x_factor}, -- F2x
+    l.predicted.probabilities[2] * ${f3x_factor}, -- F3x
+    l.predicted.probabilities[3] * ${m1x_factor}, -- M1x
+    l.predicted.probabilities[4] * ${m2x_factor}, -- M2x
+    l.predicted.probabilities[5] * ${m3x_factor} -- M3x
+  ) as probabilities
+FROM
+  t2 l
+  JOIN label_mapping r ON (l.predicted.label = r.label_id)
+
diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/rf/topk_predict.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/topk_predict.sql
new file mode 100644
index 00000000..849067dc
--- /dev/null
+++ b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/topk_predict.sql
@@ -0,0 +1,64 @@
+WITH test_data as ( -- labelled rows outside the training split, with their actual labels collected per user
+  SELECT
+    userid, collect_set(gender_age) as actual
+  FROM
+    rf_input
+  WHERE
+    gender_age is not null
+    AND rnd > ${train_rate} -- using 30% for testing
+  GROUP BY
+    userid
+),
+calibrated_prediction as ( -- class probabilities scaled by the per-class factors from config/params.yml
+  SELECT
+    userid,
+    array( -- calibration
+      probabilities[0] * ${f1x_factor}, -- F1x
+      probabilities[1] * ${f2x_factor}, -- F2x
+      probabilities[2] * ${f3x_factor}, -- F3x
+      probabilities[3] * ${m1x_factor}, -- M1x
+      probabilities[4] * ${m2x_factor}, -- M2x
+      probabilities[5] * ${m3x_factor} -- M3x
+    ) as probabilities
+  FROM
+    rf_predicted_cv
+),
+exploded as ( -- one row per (userid, array position) with that position's probability
+  select
+    l.userid,
+    r.pos,
+    r.prob
+  from
+    calibrated_prediction l
+    LATERAL VIEW posexplode(l.probabilities) r as pos, prob
+),
+predicted as ( -- per user: labels ordered by probability, limited by the -k option
+  select
+    l.userid,
+    to_ordered_list(
+      r.label, -- value
+      l.prob, -- key
+      '-k ${topk_predict}'
+    ) as predicted,
+    to_ordered_list(
+      concat(r.label, ':', l.prob), -- value
+      l.prob, -- key
+      '-k ${topk_predict}'
+    ) as predicted_with_weight
+  from
+    exploded l
+    JOIN label_mapping r ON (l.pos = r.label_id) -- array position is matched against label_id
+  where
+    l.prob > 0
+  group by
+    l.userid -- named column: a positional 1 is only treated as a column when Hive position aliases are enabled
+)
+-- DIGDAG_INSERT_LINE
+SELECT
+  l.userid,
+  l.actual,
+  r.predicted,
+  r.predicted_with_weight
+FROM
+  test_data l
+  JOIN predicted r ON (l.userid = r.userid)
\ No newline at end of file
diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql
new file mode 100644
index 00000000..e87e56a1
--- /dev/null
+++ b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql
@@ -0,0 +1,13 @@
+-- @TD distribute_strategy: aggressive
+-- DIGDAG_INSERT_LINE
+SELECT
+  userid,
+  translate(r.word,':','\;') as word -- second pass: NFKC can turn full-width colons into plain ones after the inner translate ran
+FROM
+  blogposts l
+  LATERAL VIEW explode(
+    tokenize(normalize_unicode(translate(post,':','\;'),'NFKC'),true)
+  ) r as word
+WHERE
+  NOT is_stopword(r.word) AND
+  length(r.word) >= 2 AND cast(r.word AS double) IS NULL -- drop one-character tokens and tokens that parse as numbers
\ No newline at end of file
diff --git a/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix.sql b/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix.sql
index f346f816..66217106 100644
--- a/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix.sql
+++ b/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix.sql
@@ -7,9 +7,7 @@ WITH test_data as (
   WHERE
     gender_age is not null
     AND rnd > ${train_rate} -- using 30% for testing
-  GROUP BY
-    userid
-),
+)
 select
   l.actual,
   r.label as predicted,
diff --git a/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix_pivot.sql b/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix_pivot.sql
index a4a28e35..de8f4e9a 100644
--- a/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix_pivot.sql
+++ b/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix_pivot.sql
@@ -22,4 +22,4 @@ select from tmp order by - actual asc + actual asc \ No newline at end of file diff --git a/machine-learning-box/gender_age_prediction/queries/rf/predict.sql b/machine-learning-box/gender_age_prediction/queries/rf/predict.sql index 08098af9..fa159529 100644 --- a/machine-learning-box/gender_age_prediction/queries/rf/predict.sql +++ b/machine-learning-box/gender_age_prediction/queries/rf/predict.sql @@ -36,5 +36,4 @@ SELECT ) as probabilities FROM t2 l - JOIN label_mapping r ON (l.predicted.label = r.label_id) - + JOIN label_mapping r ON (l.predicted.label = r.label_id) \ No newline at end of file