1 file changed: +45 −0 lines changed
import subprocess

import requests
from bs4 import BeautifulSoup


BASE_URL = 'http://www.geeksforgeeks.org/'

# raw_input() is Python 2 only; input() is the Python 3 equivalent
categoryUrl = input("Enter category url: ")
# An explicit parser keeps BeautifulSoup's behavior consistent across installs
soup = BeautifulSoup(requests.get(BASE_URL + categoryUrl).text, 'html.parser')

articles = []


def print_articles_to_pdf():
    print("All links scraped, extracting articles")
    # Stitch every scraped article into one HTML document
    all_articles = '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'
    all_articles += '<br><br><br><br>'.join(articles)
    with open("temp.html", "w", encoding="utf-8") as html_file:
        html_file.write(all_articles)
    print("Generating PDF GeeksForGeeks_" + categoryUrl)
    # An argument list avoids the shell-quoting pitfalls of os.system;
    # wkhtmltopdf must be installed and on the PATH
    subprocess.run(['wkhtmltopdf', 'temp.html', 'GeeksForGeeks_' + categoryUrl + '.pdf'])


# Select the article links listed on the category page
links = [a.attrs.get('href') for a in soup.select('article li a')]
# Drop same-page anchors for the category sub-sections
links = [link for link in links if not link.startswith('#')]

print("Found: " + str(len(links)) + " links")

for i, link in enumerate(links, start=1):
    try:
        print("Scraping link no: " + str(i) + " Link: " + link)
        link_soup = BeautifulSoup(requests.get(link).text, 'html.parser')
        article = link_soup.find('article')
        if article is not None:
            # str() keeps the markup as text; .encode() would produce
            # bytes and break the later string join
            articles.append(str(article))
    except KeyboardInterrupt:
        # Ctrl-C stops scraping early and builds the PDF
        # from whatever has been collected so far
        break


print_articles_to_pdf()
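
The script shells out to wkhtmltopdf, so a missing binary only surfaces after all the scraping is done. A minimal sketch of an up-front guard, assuming the tool is expected on the PATH (shutil.which is the standard-library lookup; this snippet is an illustrative addition, not part of the original change):

import shutil

# Fail fast before scraping if the PDF converter is unavailable
if shutil.which('wkhtmltopdf') is None:
    raise SystemExit("wkhtmltopdf not found on PATH; "
                     "install it from https://wkhtmltopdf.org/")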