Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
277 changes: 93 additions & 184 deletions examples/html-py-ever/Cargo.lock

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions examples/html-py-ever/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@ authors = ["konstin <konstin@mailbox.org>"]
edition = "2021"

[dependencies]
kuchiki = "0.8.0"
scraper = "0.22"
pyo3 = "0.27"
tendril = "0.4.3"

[lib]
name = "html_py_ever"
Expand Down
2 changes: 1 addition & 1 deletion examples/html-py-ever/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# html-py-ever

Demoing how to use [html5ever](https://github.com/servo/html5ever) through [kuchiki](https://github.com/kuchiki-rs/kuchiki) to speed up html parsing and css-selecting.
Demonstrates how to use [html5ever](https://github.com/servo/html5ever) through [scraper](https://github.com/rust-scraper/scraper) to speed up HTML parsing and CSS selection.

## Usage

Expand Down
9 changes: 9 additions & 0 deletions examples/html-py-ever/noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,12 @@ def test(session: nox.Session):
session.install("--no-build-isolation", ".")
# Test Python package
session.run("pytest", *session.posargs)


@nox.session()
def bench(session: nox.Session):
    # Tooling needed to build the extension and run the benchmark suite.
    bench_requirements = (
        SETUPTOOLS_RUST,
        "pytest",
        "pytest-benchmark",
        "beautifulsoup4",
    )
    session.install(*bench_requirements)

    # Build without isolation so the setuptools-rust installed above
    # (the version under development) is the one that gets used.
    session.install("--no-build-isolation", ".")

    # Run the test suite with benchmarks switched on; extra command-line
    # arguments given to nox are forwarded to pytest unchanged.
    pytest_command = ("pytest", "--benchmark-enable", *session.posargs)
    session.run(*pytest_command)
3 changes: 3 additions & 0 deletions examples/html-py-ever/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,6 @@ target = "html_py_ever.html_py_ever"
# ^-- The last part of the name (e.g. "html_py_ever") has to match lib.name in Cargo.toml,
# but you can add a prefix to nest it inside of a Python package.
# See reference for RustExtension in https://setuptools-rust.readthedocs.io/en/latest/reference.html
#
[tool.pytest.ini_options]
addopts = "--benchmark-disable"
Empty file removed examples/html-py-ever/pytest.ini
Empty file.
30 changes: 18 additions & 12 deletions examples/html-py-ever/rust/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,37 +3,43 @@ use pyo3::prelude::*;
#[pymodule]
mod html_py_ever {
use pyo3::prelude::*;
use scraper::{Html, Selector};
use std::fs;
use std::io::Read;
use std::path::Path;
use tendril::stream::TendrilSink;

/// A parsed html document
#[pyclass(unsendable)]
struct Document {
node: kuchiki::NodeRef,
html: Html,
}

#[pymethods]
impl Document {
/// Returns the selected elements as strings
fn select(&self, selector: &str) -> Vec<String> {
self.node
.select(selector)
.unwrap()
.map(|css_match| css_match.text_contents())
.collect()
/// Returns the HTML of every element in the document that matches `selector`.
///
/// `selector` is parsed as a CSS selector on every call. If it cannot be
/// parsed, a `ValueError` carrying the parser's message is raised.
fn select(&self, selector: &str) -> PyResult<Vec<String>> {
    // Report the parse failure through its `Display` text rather than its
    // `Debug` dump, so the Python exception reads as a message and not as a
    // Rust enum literal.
    let selector = Selector::parse(selector)
        .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?;
    Ok(self
        .html
        .select(&selector)
        .map(|element| element.html())
        .collect())
}
}

impl Document {
fn from_reader(reader: &mut impl Read) -> PyResult<Document> {
let node = kuchiki::parse_html().from_utf8().read_from(reader)?;
Ok(Document { node })
let mut html_string = String::new();
reader.read_to_string(&mut html_string)?;
let html = Html::parse_document(&html_string);
Ok(Document { html })
}

fn from_file(path: &Path) -> PyResult<Document> {
let node = kuchiki::parse_html().from_utf8().from_file(path)?;
Ok(Document { node })
let html_string = fs::read_to_string(path)?;
let html = Html::parse_document(&html_string);
Ok(Document { html })
}
}

Expand Down
14 changes: 8 additions & 6 deletions examples/html-py-ever/rust/main.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
//! Pure Rust version, for comparison with the Python-based calls

use kuchiki;
use scraper::{Html, Selector};

use std::env;
use std::fs;
use std::path::PathBuf;
use std::time::Instant;
use tendril::stream::TendrilSink;

fn main() {
let path = PathBuf::from(
Expand All @@ -15,13 +15,15 @@ fn main() {
);

let now = Instant::now();
let document = kuchiki::parse_html().from_utf8().from_file(&path).unwrap();
let html_string = fs::read_to_string(&path).unwrap();
let document = Html::parse_document(&html_string);
println!("{:?}", now.elapsed());

let now2 = Instant::now();
let selector = Selector::parse("a[href]").unwrap();
let links: Vec<String> = document
.select("a[href]")
.unwrap()
.map(|css_match| css_match.text_contents())
.select(&selector)
.map(|element| element.text().collect())
.collect();
println!("{} {:?}", links.len(), now2.elapsed());
}
3 changes: 1 addition & 2 deletions examples/html-py-ever/tests/test_selector.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
#!/usr/bin/env python
from glob import glob
import os
from glob import glob

import html_py_ever
import pytest
from bs4 import BeautifulSoup


HTML_FILES = glob(os.path.join(os.path.dirname(__file__), "*.html"))


Expand Down
Loading