#!/bin/bash

# Fail fast: abort on any error, on use of an unset variable, and on a failure
# anywhere in a pipeline.
set -euo pipefail

# Force default language for output sorting to be bytewise. Necessary to ensure uniformity amongst
# UNIX commands.
export LC_ALL=C
# These variables can be overridden from the external environment:
#   WLANG                - Wikipedia language edition to process (e.g. "en", "de").
#   OUT_DIR              - directory into which dumps are downloaded and processed.
#   DELETE_PROGRESSIVELY - "true" to delete each intermediate file as soon as it
#                          has been consumed, trading re-runnability for disk space.
WLANG=${WLANG:-en}
OUT_DIR=${OUT_DIR:-dump}
DELETE_PROGRESSIVELY=${DELETE_PROGRESSIVELY:-false}
12+
# By default, the latest Wikipedia dump will be downloaded. If a download date in the format
# YYYYMMDD is provided as the first argument, it will be used instead.
if [[ $# -eq 0 ]]; then
  # Scrape the dump index page for the newest YYYYMMDD directory name.
  DOWNLOAD_DATE=$(wget -q -O- "https://dumps.wikimedia.org/${WLANG}wiki/" | grep -Po '\d{8}' | sort | tail -n1)
else
  if [ ${#1} -ne 8 ]; then
    echo "[ERROR] Invalid download date provided: $1"
    # NOTE(review): the lines between the error message and `fi` are elided in
    # this diff view; `exit 1` and `DOWNLOAD_DATE=$1` reconstructed from context
    # — confirm against the full file.
    exit 1
  fi
  DOWNLOAD_DATE=$1
fi
2125
# Root directory is that of this script (used to locate sibling helper scripts).
ROOT_DIR=$(dirname "$0")
2428
# Dump locations for the selected language edition and date.
DOWNLOAD_URL="https://dumps.wikimedia.org/${WLANG}wiki/$DOWNLOAD_DATE"
TORRENT_URL="https://dump-torrents.toolforge.org/${WLANG}wiki/$DOWNLOAD_DATE"

# Canonical filenames of the dump pieces this script consumes.
SHA1SUM_FILENAME="${WLANG}wiki-$DOWNLOAD_DATE-sha1sums.txt"
REDIRECTS_FILENAME="${WLANG}wiki-$DOWNLOAD_DATE-redirect.sql.gz"
PAGES_FILENAME="${WLANG}wiki-$DOWNLOAD_DATE-page.sql.gz"
LINKS_FILENAME="${WLANG}wiki-$DOWNLOAD_DATE-pagelinks.sql.gz"
TARGETS_FILENAME="${WLANG}wiki-$DOWNLOAD_DATE-linktarget.sql.gz"
3337
# Make the output directory if it doesn't already exist and move to it.
mkdir -p "$OUT_DIR"
# Fetch each dump piece. download_file is defined earlier in this script
# (outside this view); presumably it also verifies against the sha1sums file —
# confirm against the full file.
download_file "sha1sums" "$SHA1SUM_FILENAME"
download_file "redirects" "$REDIRECTS_FILENAME"
download_file "pages" "$PAGES_FILENAME"
download_file "links" "$LINKS_FILENAME"
download_file "targets" "$TARGETS_FILENAME"
8287
8388# #########################
8489# TRIM WIKIPEDIA DUMPS #
@@ -105,7 +110,7 @@ if [ ! -f redirects.txt.gz ]; then
105110else
106111 echo " [WARN] Already trimmed redirects file"
107112fi
108-
113+ if $DELETE_PROGRESSIVELY ; then rm $REDIRECTS_FILENAME ; fi
if [ ! -f pages.txt.gz ]; then
  echo
  echo "[INFO] Trimming pages file"

  # Unzip
  # Remove all lines that don't start with INSERT INTO...
  # Keep only records in namespace 0, tolerating escaped apostrophes in titles
  # Splice out the page title and whether or not the page is a redirect
  # Zip into output file
  time pigz -dc "$PAGES_FILENAME" \
    | sed -n 's/^INSERT INTO `page` VALUES //p' \
    | grep -Eo "\([0-9]+,0,'([^']*(\\\\')?)+',[01]," \
    | sed -re $"s/^\(([0-9]+),0,'/\1\t/" \
    | sed -re $"s/',([01]),/\t\1/" \
    | pigz --fast > pages.txt.gz.tmp
  mv pages.txt.gz.tmp pages.txt.gz
else
  echo "[WARN] Already trimmed pages file"
fi
# String comparison (not command execution); -f tolerates an already-deleted file.
if [ "$DELETE_PROGRESSIVELY" = true ]; then rm -f -- "$PAGES_FILENAME"; fi
131136
if [ ! -f links.txt.gz ]; then
  echo
  echo "[INFO] Trimming links file"

  # Unzip
  # Remove all lines that don't start with INSERT INTO...
  # Split into individual records
  # Only keep records in namespace 0 (source page ID, 0, link target ID)
  # Replace the namespace with a tab
  # Zip into output file
  time pigz -dc "$LINKS_FILENAME" \
    | sed -n 's/^INSERT INTO `pagelinks` VALUES (//p' \
    | sed -e 's/),(/\'$'\n''/g' \
    | grep -E "^[0-9]+,0,[0-9]+$" \
    | sed -e $"s/,0,/\t/g" \
    | pigz --fast > links.txt.gz.tmp
  mv links.txt.gz.tmp links.txt.gz
else
  echo "[WARN] Already trimmed links file"
fi
# String comparison (not command execution); -f tolerates an already-deleted file.
if [ "$DELETE_PROGRESSIVELY" = true ]; then rm -f -- "$LINKS_FILENAME"; fi
159+
if [ ! -f targets.txt.gz ]; then
  echo
  echo "[INFO] Trimming targets file"

  # Unzip
  # Remove all lines that don't start with INSERT INTO...
  # Split into individual records
  # Only keep records in namespace 0
  # Replace namespace with a tab
  # Remove everything starting at the to page name's closing apostrophe
  # Zip into output file
  time pigz -dc "$TARGETS_FILENAME" \
    | sed -n 's/^INSERT INTO `linktarget` VALUES (//p' \
    | sed -e 's/),(/\'$'\n''/g' \
    | grep -E "^[0-9]+,0,.*$" \
    | sed -e $"s/,0,'/\t/g" \
    | sed -e "s/'$//g" \
    | pigz --fast > targets.txt.gz.tmp
  mv targets.txt.gz.tmp targets.txt.gz
else
  echo "[WARN] Already trimmed targets file"
fi
# String comparison (not command execution); -f tolerates an already-deleted file.
if [ "$DELETE_PROGRESSIVELY" = true ]; then rm -f -- "$TARGETS_FILENAME"; fi
154183
155184
156185# ##########################################
@@ -166,16 +195,29 @@ if [ ! -f redirects.with_ids.txt.gz ]; then
166195else
167196 echo " [WARN] Already replaced titles in redirects file"
168197fi
198+ if $DELETE_PROGRESSIVELY ; then rm redirects.txt.gz; fi
199+
if [ ! -f targets.with_ids.txt.gz ]; then
  echo
  echo "[INFO] Replacing titles and redirects in targets file"
  time python "$ROOT_DIR/replace_titles_and_redirects_in_targets_file.py" pages.txt.gz redirects.with_ids.txt.gz targets.txt.gz \
    | pigz --fast > targets.with_ids.txt.gz.tmp
  mv targets.with_ids.txt.gz.tmp targets.with_ids.txt.gz
else
  echo "[WARN] Already replaced titles and redirects in targets file"
fi
# String comparison (not command execution); -f tolerates an already-deleted file.
if [ "$DELETE_PROGRESSIVELY" = true ]; then rm -f -- targets.txt.gz; fi
169210
if [ ! -f links.with_ids.txt.gz ]; then
  echo
  echo "[INFO] Replacing titles and redirects in links file"
  # The links script now also takes the resolved link-target mapping produced above.
  time python "$ROOT_DIR/replace_titles_and_redirects_in_links_file.py" pages.txt.gz redirects.with_ids.txt.gz targets.with_ids.txt.gz links.txt.gz \
    | pigz --fast > links.with_ids.txt.gz.tmp
  mv links.with_ids.txt.gz.tmp links.with_ids.txt.gz
else
  echo "[WARN] Already replaced titles and redirects in links file"
fi
# String comparison (not command execution); -f tolerates already-deleted files.
if [ "$DELETE_PROGRESSIVELY" = true ]; then rm -f -- links.txt.gz targets.with_ids.txt.gz; fi
179221
if [ ! -f pages.pruned.txt.gz ]; then
  echo
  # NOTE(review): the pruning command itself is elided from this diff view —
  # confirm against the full file.
else
  echo "[WARN] Already pruned pages which are marked as redirects but with no redirect"
fi
# String comparison (not command execution); -f tolerates an already-deleted file.
if [ "$DELETE_PROGRESSIVELY" = true ]; then rm -f -- pages.txt.gz; fi
188231
189232# ####################
190233# SORT LINKS FILE #
@@ -212,6 +255,7 @@ if [ ! -f links.sorted_by_target_id.txt.gz ]; then
212255else
213256 echo " [WARN] Already sorted links file by target page ID"
214257fi
258+ if $DELETE_PROGRESSIVELY ; then rm links.with_ids.txt.gz; fi
215259
216260
217261# ############################
@@ -227,6 +271,7 @@ if [ ! -f links.grouped_by_source_id.txt.gz ]; then
227271else
228272 echo " [WARN] Already grouped source links file by source page ID"
229273fi
274+ if $DELETE_PROGRESSIVELY ; then rm links.sorted_by_source_id.txt.gz; fi
230275
if [ ! -f links.grouped_by_target_id.txt.gz ]; then
  echo
  # NOTE(review): the grouping command is elided from this diff view — confirm
  # against the full file.
else
  echo "[WARN] Already grouped target links file by target page ID"
fi
# String comparison (not command execution); -f tolerates an already-deleted file.
if [ "$DELETE_PROGRESSIVELY" = true ]; then rm -f -- links.sorted_by_target_id.txt.gz; fi
240286
241287
242288# ###############################
@@ -251,6 +297,7 @@ if [ ! -f links.with_counts.txt.gz ]; then
251297else
252298 echo " [WARN] Already combined grouped links files"
253299fi
300+ if $DELETE_PROGRESSIVELY ; then rm links.grouped_by_source_id.txt.gz links.grouped_by_target_id.txt.gz; fi
254301
255302
############################
# NOTE(review): the following statements run inside
# `if [ ! -f sdow.sqlite ]; then`; the if/fi lines and the remainder of the
# block fall outside this diff view.
  echo
  echo "[INFO] Creating redirects table"
  time pigz -dc redirects.with_ids.txt.gz | sqlite3 sdow.sqlite ".read $ROOT_DIR/../sql/createRedirectsTable.sql"
  # String comparison (not command execution); -f tolerates an already-deleted file.
  if [ "$DELETE_PROGRESSIVELY" = true ]; then rm -f -- redirects.with_ids.txt.gz; fi

  echo
  echo "[INFO] Creating pages table"
  time pigz -dc pages.pruned.txt.gz | sqlite3 sdow.sqlite ".read $ROOT_DIR/../sql/createPagesTable.sql"
  if [ "$DELETE_PROGRESSIVELY" = true ]; then rm -f -- pages.pruned.txt.gz; fi

  echo
  echo "[INFO] Creating links table"
  time pigz -dc links.with_counts.txt.gz | sqlite3 sdow.sqlite ".read $ROOT_DIR/../sql/createLinksTable.sql"
  if [ "$DELETE_PROGRESSIVELY" = true ]; then rm -f -- links.with_counts.txt.gz; fi

  echo
  echo "[INFO] Compressing SQLite file"
# (GitHub page artifact "0 commit comments" — not part of the script)