Commit 4810871

[wip] add docs and script for generating the sqlite dataset format

1 parent bf347c1 commit 4810871

3 files changed: +221 -0 lines changed

docs/source/cookbook/index.rst
Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@ Misc
 .. toctree::
    misc/global_pypi_scan.rst
    misc/datasets.rst
+   misc/pypi_dataset_sqlite_schema.rst
    misc/detections.rst
    python_2.rst
    misc/visitors.rst

docs/source/cookbook/misc/pypi_dataset_sqlite_schema.rst
Lines changed: 55 additions & 0 deletions

.. warning::

   This part of the documentation is still a work in progress and may not reflect publicly accessible data.


PyPI dataset SQLite schema
==========================

Since Aura version 2.1, we also provide the global PyPI dataset in the SQLite database format.
The following is an ER diagram of the tables within the dataset:

.. mermaid::

    erDiagram
        scans ||--o{ detections : contains
        detections }|--|| detection_types : has_type
        detections ||--o{ tags : contains
        tags }|--|| tag_names : has_name
        scans {
            integer id
            varchar package_name
            JSON metadata
        }
        detection_types {
            integer id
            text name
        }
        detections {
            integer id
            integer scan
            integer type
            text signature
            text message
            integer score
            blob extra
        }
        tag_names {
            integer id
            varchar name
        }
        tags {
            integer detection
            integer tag
        }
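
For illustration, the following sketch queries a converted dataset by joining the tables from the diagram above; the database file name is hypothetical:

.. code-block:: python

    import sqlite3

    # Hypothetical file name of a converted dataset
    conn = sqlite3.connect("pypi_dataset.sqlite")

    # List detections together with their package, type name, and tags
    rows = conn.execute("""
        SELECT s.package_name, dt.name, d.message, tn.name
        FROM detections d
        JOIN scans s ON s.id = d.scan
        JOIN detection_types dt ON dt.id = d.type
        LEFT JOIN tags t ON t.detection = d.id
        LEFT JOIN tag_names tn ON tn.id = t.tag
        LIMIT 10
    """)

    for package, detection_type, message, tag in rows:
        print(package, detection_type, message, tag)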

The script used to convert the JSON-lines dataset into the SQLite format is located inside the main Aura repository under ``files/dataset_scripts/convert2sqlite.py``. We have identified that the ``extra`` field of a detection, which has a free-form structure that depends on the specific detection, occupies a large portion of the overall dataset size. For this reason we have decided to compress the data within the ``extra`` field, which reduces the SQLite database size significantly.

The data has been compressed using the following steps:

- serialize the extra JSON (python dictionary) into a string (text)
- compress the serialized string using ``zlib.compress``
- store the compressed bytes as a blob in the ``extra`` column

You can easily deserialize the data to its original form by applying ``zlib.decompress`` to the bytes and then loading the resulting string via ``json.loads()``, as shown in the sketch below.
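
A minimal round-trip sketch, using the same hypothetical database file as above:

.. code-block:: python

    import json
    import sqlite3
    import zlib

    conn = sqlite3.connect("pypi_dataset.sqlite")

    # Fetch one compressed `extra` blob and restore the original dictionary
    (blob,) = conn.execute("SELECT extra FROM detections LIMIT 1").fetchone()
    extra = json.loads(zlib.decompress(blob).decode())
    print(extra)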

files/dataset_scripts/convert2sqlite.py
Lines changed: 165 additions & 0 deletions

import os
import sys
import sqlite3
import zlib

import rapidjson
import xxhash


# Requirements: pip install xxhash python-rapidjson

# In-memory caches mapping names to their database row IDs
DETECTION_TYPES = {}
TAG_NAMES = {}


def initialize_schema(connection):
    cur = connection.cursor()

    cur.execute("""
        CREATE TABLE scans (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            package_name VARCHAR(256) UNIQUE,
            metadata JSON
        )
    """)

    cur.execute("""
        CREATE TABLE tag_names (id INTEGER PRIMARY KEY AUTOINCREMENT, name VARCHAR(128) UNIQUE)
    """)

    cur.execute("""
        CREATE TABLE detection_types (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT)
    """)

    cur.execute("""
        CREATE TABLE detections (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            scan INTEGER NOT NULL,
            type INTEGER NOT NULL,
            signature TEXT UNIQUE,
            message TEXT NOT NULL,
            score INTEGER DEFAULT 0,
            extra BLOB,
            FOREIGN KEY (type) REFERENCES detection_types (id),
            FOREIGN KEY (scan) REFERENCES scans (id)
        )
    """)

    cur.execute("""
        CREATE TABLE tags (
            detection INTEGER NOT NULL,
            tag INTEGER NOT NULL,
            FOREIGN KEY (detection) REFERENCES detections (id),
            FOREIGN KEY (tag) REFERENCES tag_names (id),
            PRIMARY KEY (detection, tag)
        ) WITHOUT ROWID
    """)

    connection.commit()


def create_or_open(name: str):
    exists = os.path.exists(name)
    conn = sqlite3.connect(name)
    if not exists:
        initialize_schema(conn)
    else:
        load_existing(conn.cursor())

    return conn


def load_existing(cursor):
    # Repopulate the in-memory caches from an existing database
    cursor.execute("SELECT id, name FROM tag_names")

    for (row_id, name) in cursor.fetchall():
        TAG_NAMES[name] = row_id

    cursor.execute("SELECT id, name FROM detection_types")

    for (row_id, name) in cursor.fetchall():
        DETECTION_TYPES[name] = row_id


def get_tag_id(name, cursor) -> int:
    if name not in TAG_NAMES:
        cursor.execute("INSERT INTO tag_names (name) VALUES (?)", (name,))
        TAG_NAMES[name] = cursor.lastrowid

    return TAG_NAMES[name]


def get_detection_type_id(name, cursor) -> int:
    if name not in DETECTION_TYPES:
        cursor.execute("INSERT INTO detection_types (name) VALUES (?)", (name,))
        DETECTION_TYPES[name] = cursor.lastrowid

    return DETECTION_TYPES[name]


def add_detection(scan_id, detection, package, cursor):
    # TODO: add severity to table cols
    type_id = get_detection_type_id(detection["type"], cursor)
    # Namespace the signature by package to avoid cross-package collisions
    signature = xxhash.xxh64(f"{package}#{detection['signature']}").hexdigest()

    # Compress the free-form `extra` payload to keep the database small
    extra = zlib.compress(rapidjson.dumps(detection.get("extra", {})).encode())

    try:
        cursor.execute("""
            INSERT INTO detections (scan, type, signature, message, score, extra) VALUES (?, ?, ?, ?, ?, ?)
        """, (scan_id, type_id, signature, detection["message"], detection.get("score", 0), extra))
    except sqlite3.IntegrityError:
        print(f"Warning: detection already exists: {detection}")
        return

    detection_id = cursor.lastrowid

    for tag in detection.get("tags", []):
        tag_id = get_tag_id(tag, cursor)
        try:
            cursor.execute("""
                INSERT INTO tags (detection, tag) VALUES (?, ?)
            """, (detection_id, tag_id))
        except sqlite3.IntegrityError:
            pass


def add_scan(scan: dict, cursor):
    try:
        pkg_name = scan["metadata"]["uri_input"]["package"]
        print(f"Processing package: `{pkg_name}`")

        cursor.execute("""
            INSERT INTO scans (package_name, metadata) VALUES (?, ?)
        """, (pkg_name, rapidjson.dumps(scan["metadata"])))
    except sqlite3.IntegrityError:
        # Data for this package already exists
        return

    scan_id = cursor.lastrowid
    for d in scan.get("detections", []):
        add_detection(scan_id, d, pkg_name, cursor)


def process_scans(dataset_path, cursor):
    with open(dataset_path, "r") as fd:
        for line in fd:
            if line := line.strip():
                add_scan(rapidjson.loads(line), cursor)


def main(name: str, dataset_path: str):
    c = create_or_open(name)
    cur = c.cursor()

    process_scans(dataset_path, cur)

    c.commit()


if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2])
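
The script takes the output database path and the JSON-lines dataset path as positional arguments, e.g. ``python convert2sqlite.py pypi_dataset.sqlite dataset.jsonl`` (file names here are illustrative). Re-running it against an existing database is safe: already-imported packages and detections are skipped via the ``sqlite3.IntegrityError`` handlers, and the in-memory lookup caches are repopulated by ``load_existing()``.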
