Skip to content

Commit 4757adc

Browse files
committed
- improve script for pypi mirror scan
- bugfix unit test if AURA_MIRROR_PATH is defined
- smaller cosmetic code changes
1 parent 1a3080e commit 4757adc

File tree

6 files changed

+91
-30
lines changed

6 files changed

+91
-30
lines changed

aura/analyzers/package_enrichment.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@ def analyze(*, location: ScanLocation) -> AnalyzerReturnType:
1010
if not (pkg_name:=location.metadata.get("package_name")):
1111
return
1212

13-
pkg = package.PypiPackage.from_cached(pkg_name)
13+
if not (pkg:=location.metadata.get("package_instance")):
14+
pkg = package.PypiPackage.from_cached(pkg_name)
15+
1416
pkg_score = pkg.score
1517

1618
extra = {

aura/cache.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import concurrent.futures
1010
from abc import ABC, abstractmethod
1111
from pathlib import Path
12-
from typing import Optional, List, Generator, Iterable, BinaryIO, Tuple
12+
from typing import Optional, List, Generator, Iterable, BinaryIO, Tuple, Set
1313

1414
import click
1515
import requests
@@ -29,20 +29,29 @@ def __init__(self, path: Path):
2929
self.path = path
3030
self.metadata = loads(path.read_text())
3131

32-
self.cls = CACHE_TYPES[self.metadata["type"]]
32+
self.cls = CACHE_TYPES[self.type]
3333
self.item_path = path.parent / f"{self.cls.prefix}{self.metadata['id']}"
3434
self.item_stat = self.item_path.stat()
3535
# Used to avoid re-listing the cache content to find deleted items
3636
# Used for example in tests to assert which cache items were deleted
3737
self._deleted = False
3838

3939
@classmethod
40-
def iter_items(cls) -> Generator[CacheItem, None, None]:
40+
def iter_items(cls, tags=None) -> Generator[CacheItem, None, None]:
41+
tags = set(tags or ())
42+
4143
for x in Cache.get_location().iterdir():
4244
if not x.name.endswith(".metadata.json"):
4345
continue
4446

45-
yield cls(x)
47+
obj = cls(x)
48+
49+
if tags:
50+
if obj.type in tags or tags.intersection(obj.tags):
51+
yield obj
52+
else:
53+
yield obj
54+
4655

4756
@property
4857
def mtime(self) -> int:
@@ -59,6 +68,14 @@ def is_expired(self) -> bool:
5968
exp_threshold = get_expiration()
6069
return now > modified+exp_threshold
6170

71+
@property
72+
def type(self) -> str:
73+
return self.metadata["type"]
74+
75+
@property
76+
def tags(self) -> Set[str]:
77+
return set(self.metadata.get("tags", ()))
78+
6279
def delete(self):
6380
self.item_path.unlink(missing_ok=True)
6481
self.path.unlink(missing_ok=True)
@@ -144,10 +161,6 @@ def get_location(cls) -> Optional[Path]:
144161

145162
return cls.__location
146163

147-
@classmethod
148-
def proxy_url(cls, *, url, fd, cache_id=None):
149-
return FileDownloadCache.proxy(url=url, fd=fd, cache_id=cache_id)
150-
151164
def save_metadata(self):
152165
self.metadata_location.write_text(dumps(self.metadata))
153166

aura/uri_handlers/mirror.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,7 @@ def __init__(self, uri: ParseResult):
1717

1818
self.package_name = uri.hostname
1919
self.mirror_path = mirror.LocalMirror.get_mirror_path() # Path(uri.path)
20-
21-
if self.package_name == "$all":
22-
self.package = "$all"
23-
else:
24-
self.package = PypiPackage.from_cached(self.package_name)
20+
self.package = PypiPackage.from_cached(self.package_name)
2521

2622
self.opts.update(parse_qs(uri.query))
2723
self.comment = uri.fragment.lstrip("#")

files/scan_pypi_mirror.sh

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,57 @@
11
#!/bin/bash
22
set -e
33

4-
if [ "$#" -ne 1 ]; then
5-
echo "You must provide a path to the offline PyPI mirror web folder" >>2;
6-
exit 1;
7-
fi;
8-
9-
export AURA_MIRROR_PATH=$1;
104
export AURA_ALL_MODULE_IMPORTS=true;
115
export PYTHONWARNINGS=ignore;
12-
export TEMPDIR=$(dirname $(mktemp -u))
6+
export OUTDIR=${AURA_SCAN_DIR:=aura_mirror_scan}
7+
8+
9+
# Abort early when the mirror location was not supplied via the environment.
if [[ -z "${AURA_MIRROR_PATH}" ]]; then
    # ">&2" sends the message to stderr; the former ">>2" appended it to a
    # file literally named "2" in the current directory.
    echo "You must set the AURA_MIRROR_PATH env variable!" >&2
    exit 1
fi
13+
14+
15+
# Create directory structure
16+
[ -d $OUTDIR ] || mkdir $OUTDIR
17+
[ -d $OUTDIR/package_errors ] || mkdir -p $OUTDIR/package_errors
18+
[ -d $OUTDIR/package_results ] || mkdir -p $OUTDIR/package_results
1319

1420

1521
# The script reads the package listing from the "json" sub-directory of the
# mirror, so refuse to continue when it is missing.
if [ ! -d "${AURA_MIRROR_PATH}/json" ]; then
    # ">&2" sends the message to stderr; the former ">>2" appended it to a
    # file literally named "2" in the current directory.
    echo "JSON directory not found at ${AURA_MIRROR_PATH}. You probably have not provided a correct path to the web mirror directory" >&2
    exit 1
fi
1925

20-
if [ ! -f "aura_mirror_scan/package_cache" ]; then
21-
ls $AURA_MIRROR_PATH/json >aura_mirror_scan/package_cache;
26+
# Build the list of packages to scan once; later runs reuse the cached file.
if [ ! -f "$OUTDIR/package_cache" ]; then
    # Use the package list shipped with the mirror when it exists, otherwise
    # fall back to listing the mirror's json directory.
    # NOTE: "[" requires a space before the closing "]", and the nested "if"
    # needs its own "fi"; both were missing and broke the script.
    if [ -f "$AURA_MIRROR_PATH/pypi_package_list.txt" ]; then
        cp "$AURA_MIRROR_PATH/pypi_package_list.txt" "$OUTDIR/package_cache"
    else
        ls "$AURA_MIRROR_PATH/json" >"$OUTDIR/package_cache"
    fi
fi
2332

2433

25-
PKGS=$(cat aura_mirror_scan/package_cache)
34+
PKGS=$(cat $OUTDIR/package_cache)
2635

2736
scan() {
28-
AURA_LOG_LEVEL="ERROR" AURA_NO_PROGRESS=true aura scan -f json mirror://$1 -v 1> >(tee -a "aura_mirror_scan/$1.results.json" |jq .) 2> >(tee -a aura_mirror_scan/$1.errors.log >&2)
37+
ERROR_FILE=$OUTDIR/package_errors/$1.errors.log
38+
RESULTS_FILE=$OUTDIR/package_results/$1.results.json
39+
40+
AURA_LOG_LEVEL="ERROR" AURA_NO_PROGRESS=true aura scan -f json mirror://$1 -v 1> >(tee -a $RESULTS_FILE |jq .) 2> >(tee -a $ERROR_FILE >&2)
2941
if [ $? -ne 0 ]; then
30-
echo $1 >>aura_mirror_scan/failed_packages.log
42+
echo $1 >>$OUTDIR/failed_packages.log
3143
else
32-
echo $1 >>aura_mirror_scan/processed_packages.log
44+
echo $1 >>$OUTDIR/processed_packages.log
45+
fi
46+
47+
# Drop the results file when the scan produced no output.
# "-s" is true for files with size > 0, so the test must be negated;
# the un-negated form deleted every non-empty (i.e. useful) results file.
if [ -f "$RESULTS_FILE" ] && [ ! -s "$RESULTS_FILE" ]; then
    echo "Removing empty $RESULTS_FILE"
    rm -f "$RESULTS_FILE"
fi
3451

35-
if [ -s aura_mirror_scan/$1.errors.log ]; then
36-
rm aura_mirror_scan/$1.errors.log
52+
# Drop the error log when the scan wrote nothing to stderr.
# "-s" is true for files with size > 0, so the test must be negated;
# the un-negated form deleted every log that actually contained errors.
if [ -f "$ERROR_FILE" ] && [ ! -s "$ERROR_FILE" ]; then
    echo "Removing empty $ERROR_FILE"
    rm -f "$ERROR_FILE"
fi
3856

3957
}
@@ -42,4 +60,4 @@ export -f scan
4260

4361
echo "Starting Aura scan"
4462

45-
echo $PKGS|tr ' \r' '\n'| parallel --memfree 5G -j30 --progress --resume --timeout 1200 --joblog ${TEMPDIR}/aura_pypi_scan_joblog --max-args 1 scan
63+
echo $PKGS|tr ' \r' '\n'| parallel --memfree 5G -j30 --progress --resume-failed --timeout 1200 --joblog $OUTDIR/joblog --max-args 1 scan

tests/conftest.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020

2121
os.environ["AURA_NO_CACHE"] = "true"
2222

23+
# Make sure a mirror path exported in the developer's shell does not leak into
# the test run. Removing the key from os.environ also unsets it for the
# process, whereas os.unsetenv() leaves os.environ untouched, so in-process
# lookups would still see the old value.
os.environ.pop("AURA_MIRROR_PATH", None)
25+
2326

2427
# Definition used to replicate the PyPI mirror file system structure
2528
MIRROR_FILES = {

tests/test_cache.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,3 +300,32 @@ def test_ast_cache(ast_compile_mock, patterns_hash_mock, mock_cache):
300300

301301
cache.CacheItem.cleanup()
302302
assert len(tuple(cache.CacheItem.iter_items())) == 0
303+
304+
305+
@responses.activate
306+
def test_cache_tag_filtering(mock_cache, fixtures):
307+
def _cb(request):
308+
return (200, {}, "Hello world")
309+
310+
responses.add_callback(method=responses.GET, url="http://example.com/tag_test", match_querystring=False, callback=_cb)
311+
312+
tags = {"tag1", "tag2", "tag3"}
313+
314+
for tag in tags:
315+
url = f"http://example.com/tag_test?tag={tag}"
316+
cache.URLCache.proxy(url=url, tags=[tag])
317+
318+
319+
all_items = tuple(cache.CacheItem.iter_items())
320+
assert len(all_items) == len(tags)
321+
322+
items = tuple(cache.CacheItem.iter_items(tags=["non-existing"]))
323+
assert len(items) == 0
324+
325+
items_by_type = tuple(cache.CacheItem.iter_items(tags=["url"]))
326+
assert len(items_by_type) == len(all_items)
327+
328+
for tag in tags:
329+
items = tuple(cache.CacheItem.iter_items(tags=[tag]))
330+
assert len(items) == 1
331+
assert tag in items[0].tags

0 commit comments

Comments
 (0)