diff --git a/scripts/README.md b/scripts/README.md
index 04312ae..6e11a0f 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -16,7 +16,7 @@
 - `page_id` -> Id of the page
 - (`page_namespace`) -> We keep only if equals 0 (= namespace of this page)
 - `page_title` -> Title of this page
-- `page_is_redirect` -> Boolean wether this page is a redirect
+- `page_is_redirect` -> Boolean whether this page is a redirect
 - Ignore the eight following
 
 ### redirects.txt
@@ -28,19 +28,19 @@
 ## Joining the tables
 
 ### redirects.with_ids.txt (replace_titles_in_redirects_file.py)
-Replaces for each redirection, `rd_title` with the targetted `page_id` by matching on `page_title`.
+Replaces, for each redirection, `rd_title` with the targeted `page_id` by matching on `page_title`.
 The targetted page_id is then computed as a redirect recursively, until we get on a "final" page.
 - `rd_from` -> The id of the page we are redirected from
 - `page_id` -> The id of the page we get to following redirections recursively
 
 ### targets.with_ids.txt (replace_titles_and_redirects_in_targets_file.py)
-Replaces, for each linktarget, `lt_title` with the targetted `page_id` by matching on `page_title`.
+Replaces, for each linktarget, `lt_title` with the targeted `page_id` by matching on `page_title`.
 We then compute the "final" page obtained from this page following redirection, with the file `redirects.with_ids.txt`.
 - `lt_id` -> Id of this link
 - `page_id` -> The id of the page this link is pointing to, after having followed all redirections
 
 ### links.with_ids.txt (replace_titles_and_redirects_in_links_file.py)
-Replaces, for each pagelink, `lt_id` with the targetted `page_id` by joining with `links.with_ids.txt`.
+Replaces, for each pagelink, `lt_id` with the targeted `page_id` by joining with `links.with_ids.txt`.
 - `pl_from` -> Id of the "from" page, after having followed all redirections
 - `page_id` -> Id of the "to" page, after having followed all redirections
@@ -65,5 +65,16 @@ The file `links.grouped_by_target_id.txt` is like this
 - `pl_target` -> Id of the "target" page
 
 ### links.with_counts.txt (combine_grouped_links_files.py)
+We *stream-merge* (i.e. without loading the two files into memory at once) the two files `links.grouped_by_source_id.txt.gz` and `links.grouped_by_target_id.txt.gz` into a single sorted links file `links.with_counts.txt`.
 
 ## Making the database
+We create three tables (`redirects`, `pages`, `links`) from the files `redirects.with_ids.txt.gz`, `pages.pruned.txt.gz` and `links.with_counts.txt.gz`.
+
+
+## Notes on building the database locally (on Debian-based machines)
+
+Depending on your hardware, you might need to change some values in the `buildDatabase.sh` script, most notably how much memory the `sort` command is allowed to use (its `-S` option), which you can set either as a percentage or as an absolute size in gigabytes. On a DigitalOcean droplet with 16 GB of RAM, giving `sort` 4 GB worked best; the whole database generation took about two hours (14 GB final size for the `sdow.sqlite` file).
+
+If the script keeps exiting with no error message, it is possible that the system is killing it for using too many resources. You can investigate that by running `dmesg | grep -i "killed process"`.
+
+After creating the database, you can inspect its contents using `sqlitebrowser`.
diff --git a/scripts/buildDatabase.sh b/scripts/buildDatabase.sh
index 5f622ed..5c29169 100755
--- a/scripts/buildDatabase.sh
+++ b/scripts/buildDatabase.sh
@@ -24,7 +24,7 @@ else
 fi
 
 # Root directory is that of this script
-ROOT_DIR=$(dirname "$0")
+ROOT_DIR=$(cd "$(dirname "$0")" && pwd)
 
 DOWNLOAD_URL="https://dumps.wikimedia.org/${WLANG}wiki/$DOWNLOAD_DATE"
 TORRENT_URL="https://dump-torrents.toolforge.org/${WLANG}wiki/$DOWNLOAD_DATE"
@@ -188,8 +188,8 @@ if $DELETE_PROGRESSIVELY; then rm $TARGETS_FILENAME; fi
 if [ ! -f redirects.with_ids.txt.gz ]; then
   echo
   echo "[INFO] Replacing titles in redirects file"
-  time python "$ROOT_DIR/replace_titles_in_redirects_file.py" pages.txt.gz redirects.txt.gz \
-    | sort -S 100% -t $'\t' -k 1n,1n \
+  time python3 "$ROOT_DIR/replace_titles_in_redirects_file.py" pages.txt.gz redirects.txt.gz \
+    | sort -S 80% -t $'\t' -k 1n,1n \
     | pigz --fast > redirects.with_ids.txt.gz.tmp
   mv redirects.with_ids.txt.gz.tmp redirects.with_ids.txt.gz
 else
@@ -200,7 +200,7 @@ if $DELETE_PROGRESSIVELY; then rm redirects.txt.gz; fi
 if [ ! -f targets.with_ids.txt.gz ]; then
   echo
   echo "[INFO] Replacing titles and redirects in targets file"
-  time python "$ROOT_DIR/replace_titles_and_redirects_in_targets_file.py" pages.txt.gz redirects.with_ids.txt.gz targets.txt.gz \
+  time python3 "$ROOT_DIR/replace_titles_and_redirects_in_targets_file.py" pages.txt.gz redirects.with_ids.txt.gz targets.txt.gz \
     | pigz --fast > targets.with_ids.txt.gz.tmp
   mv targets.with_ids.txt.gz.tmp targets.with_ids.txt.gz
 else
@@ -211,7 +211,7 @@ if $DELETE_PROGRESSIVELY; then rm targets.txt.gz; fi
 if [ ! -f links.with_ids.txt.gz ]; then
   echo
   echo "[INFO] Replacing titles and redirects in links file"
-  time python "$ROOT_DIR/replace_titles_and_redirects_in_links_file.py" pages.txt.gz redirects.with_ids.txt.gz targets.with_ids.txt.gz links.txt.gz \
+  time python3 "$ROOT_DIR/replace_titles_and_redirects_in_links_file.py" pages.txt.gz redirects.with_ids.txt.gz targets.with_ids.txt.gz links.txt.gz \
     | pigz --fast > links.with_ids.txt.gz.tmp
   mv links.with_ids.txt.gz.tmp links.with_ids.txt.gz
 else
@@ -222,7 +222,7 @@ if $DELETE_PROGRESSIVELY; then rm links.txt.gz targets.with_ids.txt.gz; fi
 if [ ! -f pages.pruned.txt.gz ]; then
   echo
   echo "[INFO] Pruning pages which are marked as redirects but with no redirect"
-  time python "$ROOT_DIR/prune_pages_file.py" pages.txt.gz redirects.with_ids.txt.gz \
+  time python3 "$ROOT_DIR/prune_pages_file.py" pages.txt.gz redirects.with_ids.txt.gz \
     | pigz --fast > pages.pruned.txt.gz
 else
   echo "[WARN] Already pruned pages which are marked as redirects but with no redirect"
@@ -236,7 +236,7 @@ if [ ! -f links.sorted_by_source_id.txt.gz ]; then
   echo
   echo "[INFO] Sorting links file by source page ID"
   time pigz -dc links.with_ids.txt.gz \
-    | sort -S 80% -t $'\t' -k 1n,1n \
+    | sort -T . -S 80% -t $'\t' -k 1n,1n \
     | uniq \
     | pigz --fast > links.sorted_by_source_id.txt.gz.tmp
   mv links.sorted_by_source_id.txt.gz.tmp links.sorted_by_source_id.txt.gz
@@ -248,7 +248,7 @@ if [ ! -f links.sorted_by_target_id.txt.gz ]; then
   echo
   echo "[INFO] Sorting links file by target page ID"
   time pigz -dc links.with_ids.txt.gz \
-    | sort -S 80% -t $'\t' -k 2n,2n \
+    | sort -T . -S 80% -t $'\t' -k 2n,2n \
     | uniq \
     | pigz --fast > links.sorted_by_target_id.txt.gz.tmp
   mv links.sorted_by_target_id.txt.gz.tmp links.sorted_by_target_id.txt.gz
@@ -291,7 +291,7 @@ if $DELETE_PROGRESSIVELY; then rm links.sorted_by_target_id.txt.gz; fi
 if [ ! -f links.with_counts.txt.gz ]; then
   echo
   echo "[INFO] Combining grouped links files"
-  time python "$ROOT_DIR/combine_grouped_links_files.py" links.grouped_by_source_id.txt.gz links.grouped_by_target_id.txt.gz \
+  time python3 "$ROOT_DIR/combine_grouped_links_files.py" links.grouped_by_source_id.txt.gz links.grouped_by_target_id.txt.gz \
     | pigz --fast > links.with_counts.txt.gz.tmp
   mv links.with_counts.txt.gz.tmp links.with_counts.txt.gz
 else
diff --git a/scripts/combine_grouped_links_files.py b/scripts/combine_grouped_links_files.py
index 35a1c7b..19fed21 100755
--- a/scripts/combine_grouped_links_files.py
+++ b/scripts/combine_grouped_links_files.py
@@ -4,51 +4,72 @@
 Output is written to stdout.
 """
 
-import io
 import sys
 import gzip
-from collections import defaultdict
 
-# Validate input arguments.
+# validate input arguments.
 if len(sys.argv) < 2:
   print('[ERROR] Not enough arguments provided!')
-  print('[INFO] Usage: {0} <outgoing_links_file> <incoming_links_file>'.format(sys.argv[0]))
+  print('[INFO] Usage: {0} <outgoing_links_file> <incoming_links_file>'.format(sys.argv[0]), file=sys.stderr)
   sys.exit()
 
 OUTGOING_LINKS_FILE = sys.argv[1]
 INCOMING_LINKS_FILE = sys.argv[2]
 
 if not OUTGOING_LINKS_FILE.endswith('.gz'):
-  print('[ERROR] Outgoing links file must be gzipped.')
+  print('[ERROR] Outgoing links file must be gzipped.', file=sys.stderr)
   sys.exit()
 
 if not INCOMING_LINKS_FILE.endswith('.gz'):
-  print('[ERROR] Incoming links file must be gzipped.')
+  print('[ERROR] Incoming links file must be gzipped.', file=sys.stderr)
   sys.exit()
 
-# Create a dictionary of page IDs to their incoming and outgoing links.
-LINKS = defaultdict(lambda: defaultdict(str))
-# outgoing is [0], incoming is [1]
-for line in io.BufferedReader(gzip.open(OUTGOING_LINKS_FILE, 'rb')):
-  [source_page_id, target_page_ids] = line.rstrip(b'\n').split(b'\t')
-  LINKS[int(source_page_id)][0] = target_page_ids
+def parse_line(line):
+  parts = line.rstrip(b'\n').split(b'\t', 1)
+  return (int(parts[0]), parts[1] if len(parts) > 1 else b'')
 
-for line in io.BufferedReader(gzip.open(INCOMING_LINKS_FILE, 'rb')):
-  [target_page_id, source_page_ids] = line.rstrip(b'\n').split(b'\t')
-  LINKS[int(target_page_id)][1] = source_page_ids
+def file_iterator(filename):
+  with gzip.open(filename, 'rb') as f:
+    for line in f:
+      yield parse_line(line)
 
-# For each page in the links dictionary, print out its incoming and outgoing links as well as their
-# counts.
-for page_id, links in LINKS.items():
-  outgoing_links = links.get(0, b'')
-  outgoing_links_count = 0 if outgoing_links==b'' else len(
-    outgoing_links.split(b'|'))
+# Merge the two sorted files. We use generators instead of dicts so that the content is
+# streamed rather than loaded entirely into memory, which greatly reduces RAM consumption.
-  incoming_links = links.get(1, b'')
-  incoming_links_count = 0 if incoming_links==b'' else len(
-    incoming_links.split(b'|'))
+outgoing_iter = file_iterator(OUTGOING_LINKS_FILE)
+incoming_iter = file_iterator(INCOMING_LINKS_FILE)
 
-  columns = [str(page_id).encode(), str(outgoing_links_count).encode(), str(
-    incoming_links_count).encode(), outgoing_links, incoming_links]
+outgoing_current = next(outgoing_iter, None)
+incoming_current = next(incoming_iter, None)
 
-  print(b'\t'.join(columns).decode())
+while outgoing_current is not None or incoming_current is not None:
+  # Emit the entry with the smaller page ID first; when both files are at the same
+  # page ID, combine its outgoing and incoming links into a single output row.
+  if outgoing_current is None:
+    page_id, incoming_links = incoming_current
+    outgoing_links = b''
+    incoming_current = next(incoming_iter, None)
+  elif incoming_current is None:
+    page_id, outgoing_links = outgoing_current
+    incoming_links = b''
+    outgoing_current = next(outgoing_iter, None)
+  elif outgoing_current[0] < incoming_current[0]:
+    page_id, outgoing_links = outgoing_current
+    incoming_links = b''
+    outgoing_current = next(outgoing_iter, None)
+  elif incoming_current[0] < outgoing_current[0]:
+    page_id, incoming_links = incoming_current
+    outgoing_links = b''
+    incoming_current = next(incoming_iter, None)
+  else:
+    page_id = outgoing_current[0]
+    outgoing_links = outgoing_current[1]
+    incoming_links = incoming_current[1]
+    outgoing_current = next(outgoing_iter, None)
+    incoming_current = next(incoming_iter, None)
+
+  outgoing_links_count = 0 if outgoing_links == b'' else len(outgoing_links.split(b'|'))
+  incoming_links_count = 0 if incoming_links == b'' else len(incoming_links.split(b'|'))
+
+  columns = [str(page_id).encode(), str(outgoing_links_count).encode(),
+             str(incoming_links_count).encode(), outgoing_links, incoming_links]
+
+  print(b'\t'.join(columns).decode())
diff --git a/scripts/prune_pages_file.py b/scripts/prune_pages_file.py
index a4bdf60..8c32844 100644
--- a/scripts/prune_pages_file.py
+++ b/scripts/prune_pages_file.py
@@ -37,5 +37,5 @@
 for line in io.BufferedReader(gzip.open(PAGES_FILE, 'rb')):
   [page_id, page_title, is_redirect] = line.rstrip(b'\n').split(b'\t')
 
-  if True or is_redirect == '0' or page_id in REDIRECTS:
+  if is_redirect == b'0' or page_id in REDIRECTS:
     print(b'\t'.join([page_id, page_title, is_redirect]).decode())
diff --git a/scripts/replace_titles_in_redirects_file.py.dis b/scripts/replace_titles_in_redirects_file.py.dis
index a423414..65b0dca 100755
--- a/scripts/replace_titles_in_redirects_file.py.dis
+++ b/scripts/replace_titles_in_redirects_file.py.dis
@@ -29,7 +29,6 @@ if not REDIRECTS_FILE.endswith('.gz'):
 ALL_TARGET_IDS = set()
 TARGET_TITLES_TO_IDS = {}
 for line in io.BufferedReader(gzip.open(TARGETS_FILE, 'rb')):
-  print("LALIGNE",line.rstrip(b'\n').split(b'\t'))
   [page_id, page_title,_] = line.rstrip(b'\n').split(b'\t')
   ALL_TARGET_IDS.add(page_id)
   TARGET_TITLES_TO_IDS[page_title] = page_id
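Side note on the "Making the database" step documented in `scripts/README.md`: this patch does not touch the table-creation code itself, so as a rough illustration only, here is a minimal Python sketch of how the three tables could be populated from the generated TSV files. The `redirects`, `pages` and `links` table names and column layouts below are assumptions based on the TSV formats described in the README; the real build is driven by `buildDatabase.sh`, and the actual schema may differ.

```python
# Minimal illustrative sketch, NOT part of this patch: load the generated gzipped TSV
# dumps into an SQLite database. Table and column names are assumptions based on the
# layouts described in scripts/README.md; the real schema may differ.
import gzip
import sqlite3

conn = sqlite3.connect('sdow.sqlite')

# Assumed schemas mirroring the TSV columns documented in the README.
conn.executescript("""
  CREATE TABLE IF NOT EXISTS redirects (source_id INTEGER PRIMARY KEY, target_id INTEGER);
  CREATE TABLE IF NOT EXISTS pages (id INTEGER PRIMARY KEY, title TEXT, is_redirect INTEGER);
  CREATE TABLE IF NOT EXISTS links (id INTEGER PRIMARY KEY,
                                    outgoing_links_count INTEGER, incoming_links_count INTEGER,
                                    outgoing_links TEXT, incoming_links TEXT);
""")

def load_tsv(table, filename, n_columns):
  """Stream a gzipped TSV file into the given table without reading it fully into memory."""
  placeholders = ', '.join(['?'] * n_columns)
  with gzip.open(filename, 'rt', encoding='utf-8') as f:
    rows = (line.rstrip('\n').split('\t') for line in f)
    conn.executemany('INSERT INTO {0} VALUES ({1})'.format(table, placeholders), rows)

load_tsv('redirects', 'redirects.with_ids.txt.gz', 2)
load_tsv('pages', 'pages.pruned.txt.gz', 3)
load_tsv('links', 'links.with_counts.txt.gz', 5)

conn.commit()
conn.close()
```

For a bulk load of this size, disabling SQLite journaling during the import (e.g. `PRAGMA journal_mode = OFF`) and creating indexes only after the data is inserted can shorten the build noticeably.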