From 190ffb57b22f455d465f41507346815a9f40fd7c Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Tue, 25 Nov 2025 20:21:32 +0800 Subject: [PATCH] feat(pytorch): Add PyTorch 2.8 and 2.9 documentation support This commit updates the PyTorch scraper for documentation versions 2.8 and 2.9, addressing changes in the theme and HTML structure. Key changes: - Identifies the main content area correctly in docs for newer versions. - Supports the new breadcrumb navigation structure. - Restores truncated entry names in newer docs using the full page title, maintaining consistent naming conventions. --- lib/docs/filters/pytorch/clean_html.rb | 8 ++++--- lib/docs/filters/pytorch/entries.rb | 30 ++++++++++++++++++++++---- lib/docs/scrapers/pytorch.rb | 10 +++++++++ 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/lib/docs/filters/pytorch/clean_html.rb b/lib/docs/filters/pytorch/clean_html.rb index dd19c3e0f7..455b939ee4 100644 --- a/lib/docs/filters/pytorch/clean_html.rb +++ b/lib/docs/filters/pytorch/clean_html.rb @@ -2,9 +2,11 @@ module Docs class Pytorch class CleanHtmlFilter < Filter def call - @doc = at_css('.pytorch-article') - # Show katex-mathml nodes and remove katex-html nodes - css('.katex-html').remove + if root = at_css('#pytorch-article') + @doc = root + # Show katex-mathml nodes and remove katex-html nodes + css('.katex-html').remove + end doc end end diff --git a/lib/docs/filters/pytorch/entries.rb b/lib/docs/filters/pytorch/entries.rb index 37e26e3129..85d2f06418 100644 --- a/lib/docs/filters/pytorch/entries.rb +++ b/lib/docs/filters/pytorch/entries.rb @@ -2,9 +2,23 @@ module Docs class Pytorch class EntriesFilter < Docs::EntriesFilter def get_breadcrumbs - css('.pytorch-breadcrumbs > li').map { - |node| node.content.delete_suffix(' >').strip - }.reject { |item| item.nil? || item.empty? 
} + breadcrumbs = if at_css('.pytorch-breadcrumbs') + css('.pytorch-breadcrumbs > li').map { |node| + node.content.delete_suffix(' >').strip + } + else + css('.bd-breadcrumbs > li').map { |node| + text = node.content.strip + text.empty? && node.at_css('.fa-home') ? 'Docs' : text + } + end.reject { |item| item.nil? || item.empty? } + + if breadcrumbs.last&.end_with?('.') + resolved_name = at_css('h1').content.delete_suffix('#').strip + breadcrumbs[-1] = resolved_name + end + + breadcrumbs end def get_name @@ -12,7 +26,15 @@ def get_name end def get_type - get_breadcrumbs[1] + if at_css('.pytorch-breadcrumbs') + get_breadcrumbs[1] + else + get_breadcrumbs.size > 2 ? get_breadcrumbs[2] : get_breadcrumbs[1] + end + end + + def include_default_entry? + !get_breadcrumbs.nil? && get_breadcrumbs.size >= 2 end def additional_entries diff --git a/lib/docs/scrapers/pytorch.rb b/lib/docs/scrapers/pytorch.rb index cfa1d51010..e45ac3b096 100644 --- a/lib/docs/scrapers/pytorch.rb +++ b/lib/docs/scrapers/pytorch.rb @@ -19,6 +19,16 @@ class Pytorch < UrlScraper PyTorch has a BSD-style license, as found in the LICENSE file. HTML + version '2.9' do + self.release = '2.9' + self.base_url = "https://docs.pytorch.org/docs/#{release}/" + end + + version '2.8' do + self.release = '2.8' + self.base_url = "https://docs.pytorch.org/docs/#{release}/" + end + version '2.7' do self.release = '2.7' self.base_url = "https://docs.pytorch.org/docs/#{release}/"