diff --git a/examples/html-py-ever/Cargo.lock b/examples/html-py-ever/Cargo.lock index b54ea051..51a77522 100644 --- a/examples/html-py-ever/Cargo.lock +++ b/examples/html-py-ever/Cargo.lock @@ -8,12 +8,6 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.5.0" @@ -32,27 +26,17 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" -[[package]] -name = "convert_case" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" - [[package]] name = "cssparser" -version = "0.27.2" +version = "0.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a" +checksum = "b7c66d1cd8ed61bf80b38432613a7a2f09401ab8d0501110655f8b341484a3e3" dependencies = [ "cssparser-macros", "dtoa-short", "itoa", - "matches", "phf", - "proc-macro2", - "quote", "smallvec", - "syn 1.0.109", ] [[package]] @@ -71,10 +55,8 @@ version = "0.99.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" dependencies = [ - "convert_case", "proc-macro2", "quote", - "rustc_version", "syn 1.0.109", ] @@ -93,6 +75,12 @@ dependencies = [ "dtoa", ] +[[package]] +name = "ego-tree" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8" + [[package]] name = "futf" version = "0.1.5" @@ -113,14 +101,12 @@ dependencies = [ ] [[package]] -name = "getrandom" -version = "0.1.16" +name = "getopts" +version = "0.2.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df" dependencies = [ - "cfg-if", - "libc", - "wasi 0.9.0+wasi-snapshot-preview1", + "unicode-width", ] [[package]] @@ -131,7 +117,7 @@ checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", ] [[package]] @@ -144,23 +130,20 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" name = "html-py-ever" version = "0.1.0" dependencies = [ - "kuchiki", "pyo3", - "tendril", + "scraper", ] [[package]] name = "html5ever" -version = "0.25.2" +version = "0.29.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5c13fb08e5d4dfc151ee5e88bae63f7773d61852f3bdc73c9f4b9e1bde03148" +checksum = "3b7410cae13cbc75623c98ac4cbfd1f0bedddf3227afc24f370cf0f50a44a11c" dependencies = [ "log", "mac", "markup5ever", - "proc-macro2", - "quote", - "syn 1.0.109", + "match_token", ] [[package]] @@ -171,21 +154,9 @@ checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" [[package]] name = "itoa" -version = "0.4.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" - -[[package]] -name = "kuchiki" -version = "0.8.1" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ea8e9c6e031377cff82ee3001dc8026cdf431ed4e2e6b51f98ab8c73484a358" -dependencies = [ - "cssparser", - "html5ever", - "matches", - "selectors", -] +checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] name = "libc" @@ -217,9 +188,9 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "markup5ever" -version = "0.10.1" +version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd" +checksum = "c7a7213d12e1864c0f002f52c2923d4556935a43dec5e71355c2760e0f6e7a18" dependencies = [ "log", "phf", @@ -230,10 +201,15 @@ dependencies = [ ] [[package]] -name = "matches" -version = "0.1.10" +name = "match_token" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5" +checksum = "88a9689d8d44bf9964484516275f5cd4c9b59457a6940c1d5d0ecbb94510a36b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.61", +] [[package]] name = "memoffset" @@ -250,12 +226,6 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" -[[package]] -name = "nodrop" -version = "0.1.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" - [[package]] name = "once_cell" version = "1.21.3" @@ -287,75 +257,73 @@ dependencies = [ [[package]] name = "phf" -version = "0.8.0" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" dependencies = [ "phf_macros", - "phf_shared 0.8.0", - "proc-macro-hack", + "phf_shared 0.11.3", ] [[package]] name = "phf_codegen" -version = "0.8.0" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" dependencies = [ - "phf_generator 0.8.0", - "phf_shared 0.8.0", + "phf_generator 0.11.3", + "phf_shared 0.11.3", ] [[package]] name = "phf_generator" -version = "0.8.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" dependencies = [ - "phf_shared 0.8.0", - "rand 0.7.3", + "phf_shared 0.10.0", + "rand", ] [[package]] name = "phf_generator" -version = "0.10.0" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" dependencies = [ - "phf_shared 0.10.0", - "rand 0.8.5", + "phf_shared 0.11.3", + "rand", ] [[package]] name = "phf_macros" -version = "0.8.0" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f6fde18ff429ffc8fe78e2bf7f8b7a5a5a6e2a8b58bc5a9ac69198bbda9189c" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" dependencies = [ - "phf_generator 0.8.0", - "phf_shared 0.8.0", - "proc-macro-hack", + "phf_generator 0.11.3", + "phf_shared 0.11.3", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.61", ] [[package]] name = "phf_shared" -version = "0.8.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" dependencies = [ - "siphasher", + "siphasher 0.3.11", ] [[package]] name = "phf_shared" -version = "0.10.0" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" dependencies = [ - "siphasher", + "siphasher 1.0.2", ] [[package]] @@ -376,12 +344,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" -[[package]] -name = "proc-macro-hack" -version = "0.5.20+deprecated" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" - [[package]] name = "proc-macro2" version = "1.0.82" @@ -461,20 +423,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "rand" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" -dependencies = [ - "getrandom 0.1.16", - "libc", - "rand_chacha 0.2.2", - "rand_core 0.5.1", - "rand_hc", - "rand_pcg", -] - [[package]] name = "rand" version = "0.8.5" @@ -482,18 +430,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha 0.3.1", - "rand_core 0.6.4", -] - -[[package]] -name = "rand_chacha" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" -dependencies = [ - "ppv-lite86", - "rand_core 0.5.1", + "rand_chacha", + "rand_core", ] [[package]] @@ -503,16 +441,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core 0.6.4", -] - -[[package]] -name = "rand_core" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" -dependencies = [ - "getrandom 0.1.16", + "rand_core", ] [[package]] @@ -521,25 +450,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom 0.2.15", -] - -[[package]] -name = "rand_hc" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" -dependencies = [ - "rand_core 0.5.1", -] - -[[package]] -name = "rand_pcg" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" -dependencies = [ - "rand_core 0.5.1", + "getrandom", ] [[package]] @@ -548,16 +459,7 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "469052894dcb553421e483e4209ee581a45100d31b4018de03e5a7ad86374a7e" dependencies = [ - "bitflags 2.5.0", -] - -[[package]] -name = "rustc_version" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" -dependencies = [ - "semver", + "bitflags", ] [[package]] @@ -567,31 +469,39 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] -name = "selectors" +name = "scraper" version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe" +checksum = "cc3d051b884f40e309de6c149734eab57aa8cc1347992710dc80bcc1c2194c15" dependencies = [ - "bitflags 1.3.2", + "cssparser", + "ego-tree", + "getopts", + "html5ever", + "precomputed-hash", + "selectors", + "tendril", +] + +[[package]] +name = "selectors" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd568a4c9bb598e291a08244a5c1f5a8a6650bee243b5b0f8dbb3d9cc1d87fe8" +dependencies = [ + "bitflags", "cssparser", "derive_more", "fxhash", "log", - "matches", + "new_debug_unreachable", "phf", "phf_codegen", "precomputed-hash", "servo_arc", "smallvec", - "thin-slice", ] -[[package]] -name = "semver" -version = "1.0.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" - [[package]] name = "serde" version = "1.0.200" @@ -614,11 +524,10 @@ dependencies = [ [[package]] name = "servo_arc" -version = "0.1.1" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432" +checksum = "170fb83ab34de17dc69aa7c67482b22218ddb85da56546f9bd6b929e32a05930" dependencies = [ - "nodrop", "stable_deref_trait", ] @@ -628,6 +537,12 @@ version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" +[[package]] +name = "siphasher" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" + [[package]] name = "smallvec" version = "1.13.2" @@ -705,18 +620,18 @@ dependencies = [ "utf-8", ] -[[package]] -name = "thin-slice" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c" - [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + [[package]] name = "unindent" version = "0.2.3" @@ -729,12 +644,6 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" -[[package]] -name = "wasi" -version = "0.9.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" diff --git a/examples/html-py-ever/Cargo.toml b/examples/html-py-ever/Cargo.toml index f943a8e1..d8fa4e12 100644 --- a/examples/html-py-ever/Cargo.toml +++ b/examples/html-py-ever/Cargo.toml @@ -5,9 +5,8 @@ authors = ["konstin "] edition = "2021" [dependencies] -kuchiki = "0.8.0" +scraper = "0.22" pyo3 = "0.27" -tendril = "0.4.3" [lib] name = "html_py_ever" diff --git a/examples/html-py-ever/README.md b/examples/html-py-ever/README.md index 4e3c3b28..e6fac0e3 100644 --- a/examples/html-py-ever/README.md +++ b/examples/html-py-ever/README.md @@ -1,6 +1,6 @@ # html-py-ever -Demoing how to use [html5ever](https://github.com/servo/html5ever) through [kuchiki](https://github.com/kuchiki-rs/kuchiki) to speed up html parsing and css-selecting. +Demoing how to use [html5ever](https://github.com/servo/html5ever) through [scraper](https://github.com/rust-scraper/scraper) to speed up html parsing and css-selecting. ## Usage diff --git a/examples/html-py-ever/noxfile.py b/examples/html-py-ever/noxfile.py index ed5da892..8976f4eb 100644 --- a/examples/html-py-ever/noxfile.py +++ b/examples/html-py-ever/noxfile.py @@ -12,3 +12,12 @@ def test(session: nox.Session): session.install("--no-build-isolation", ".") # Test Python package session.run("pytest", *session.posargs) + + +@nox.session() +def bench(session: nox.Session): + session.install(SETUPTOOLS_RUST, "pytest", "pytest-benchmark", "beautifulsoup4") + # Ensure build uses version of setuptools-rust under development + session.install("--no-build-isolation", ".") + # Test Python package + session.run("pytest", "--benchmark-enable", *session.posargs) diff --git a/examples/html-py-ever/pyproject.toml b/examples/html-py-ever/pyproject.toml index e368fbdb..23e6e098 100644 --- a/examples/html-py-ever/pyproject.toml +++ b/examples/html-py-ever/pyproject.toml @@ -36,3 +36,6 @@ target = "html_py_ever.html_py_ever" # ^-- The last part of the name (e.g. "html_py_ever") has to match lib.name in Cargo.toml, # but you can add a prefix to nest it inside of a Python package. # See reference for RustExtension in https://setuptools-rust.readthedocs.io/en/latest/reference.html +# +[tool.pytest.ini_options] +addopts = "--benchmark-disable" diff --git a/examples/html-py-ever/pytest.ini b/examples/html-py-ever/pytest.ini deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/html-py-ever/rust/lib.rs b/examples/html-py-ever/rust/lib.rs index 6558418a..b731a702 100644 --- a/examples/html-py-ever/rust/lib.rs +++ b/examples/html-py-ever/rust/lib.rs @@ -3,37 +3,43 @@ use pyo3::prelude::*; #[pymodule] mod html_py_ever { use pyo3::prelude::*; + use scraper::{Html, Selector}; + use std::fs; use std::io::Read; use std::path::Path; - use tendril::stream::TendrilSink; /// A parsed html document #[pyclass(unsendable)] struct Document { - node: kuchiki::NodeRef, + html: Html, } #[pymethods] impl Document { /// Returns the selected elements as strings - fn select(&self, selector: &str) -> Vec { - self.node - .select(selector) - .unwrap() - .map(|css_match| css_match.text_contents()) - .collect() + fn select(&self, selector: &str) -> PyResult> { + let selector = Selector::parse(selector) + .map_err(|e| PyErr::new::(format!("{e:?}")))?; + Ok(self + .html + .select(&selector) + .map(|element| element.html()) + .collect()) } } impl Document { fn from_reader(reader: &mut impl Read) -> PyResult { - let node = kuchiki::parse_html().from_utf8().read_from(reader)?; - Ok(Document { node }) + let mut html_string = String::new(); + reader.read_to_string(&mut html_string)?; + let html = Html::parse_document(&html_string); + Ok(Document { html }) } fn from_file(path: &Path) -> PyResult { - let node = kuchiki::parse_html().from_utf8().from_file(path)?; - Ok(Document { node }) + let html_string = fs::read_to_string(path)?; + let html = Html::parse_document(&html_string); + Ok(Document { html }) } } diff --git a/examples/html-py-ever/rust/main.rs b/examples/html-py-ever/rust/main.rs index e1748e49..340b4dad 100644 --- a/examples/html-py-ever/rust/main.rs +++ b/examples/html-py-ever/rust/main.rs @@ -1,11 +1,11 @@ //! Pure rust version for comparing with python based calls -use kuchiki; +use scraper::{Html, Selector}; use std::env; +use std::fs; use std::path::PathBuf; use std::time::Instant; -use tendril::stream::TendrilSink; fn main() { let path = PathBuf::from( @@ -15,13 +15,15 @@ fn main() { ); let now = Instant::now(); - let document = kuchiki::parse_html().from_utf8().from_file(&path).unwrap(); + let html_string = fs::read_to_string(&path).unwrap(); + let document = Html::parse_document(&html_string); println!("{:?}", now.elapsed()); + let now2 = Instant::now(); + let selector = Selector::parse("a[href]").unwrap(); let links: Vec = document - .select("a[href]") - .unwrap() - .map(|css_match| css_match.text_contents()) + .select(&selector) + .map(|element| element.text().collect()) .collect(); println!("{} {:?}", links.len(), now2.elapsed()); } diff --git a/examples/html-py-ever/tests/test_selector.py b/examples/html-py-ever/tests/test_selector.py index 862c9b12..8ac3e187 100755 --- a/examples/html-py-ever/tests/test_selector.py +++ b/examples/html-py-ever/tests/test_selector.py @@ -1,12 +1,11 @@ #!/usr/bin/env python -from glob import glob import os +from glob import glob import html_py_ever import pytest from bs4 import BeautifulSoup - HTML_FILES = glob(os.path.join(os.path.dirname(__file__), "*.html"))