Commit 66d6919

G4G: Scrapes given category and generates PDF
1 parent 7eb116b commit 66d6919

File tree

1 file changed (+45, −0 lines)

g4g.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
import subprocess

import requests
from bs4 import BeautifulSoup


BASE_URL = 'http://www.geeksforgeeks.org/'

# input() replaces Python 2's raw_input()
categoryUrl = input("Enter category url: ")
soup = BeautifulSoup(requests.get(BASE_URL + categoryUrl).text, 'html.parser')

articles = []


def print_articles_to_pdf():
    print("All links scraped, extracting articles")
    # Declare UTF-8 so wkhtmltopdf renders non-ASCII characters correctly
    allArticles = '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'
    allArticles += '<br><br><br><br>'.join(articles)
    with open("temp.html", "w", encoding="utf-8") as html_file:
        html_file.write(allArticles)
    # Replace any slashes in the category path so the PDF filename is valid
    pdf_name = 'GeeksForGeeks_' + categoryUrl.strip('/').replace('/', '_') + '.pdf'
    print("Generating PDF " + pdf_name)
    # subprocess.run() avoids the shell-injection risk of os.system()
    subprocess.run(['wkhtmltopdf', 'temp.html', pdf_name])


# Selecting links which are in the category page
links = [a.attrs.get('href') for a in soup.select('article li a')]
# Removing links for the categories with anchor on same page
links = [link for link in links if link and not link.startswith('#')]

print("Found: " + str(len(links)) + " links")

for i, link in enumerate(links, start=1):
    try:
        print("Scraping link no: " + str(i) + " Link: " + link)
        link_soup = BeautifulSoup(requests.get(link).text, 'html.parser')
        article = link_soup.find('article')
        if article is not None:
            articles.append(str(article))
    except KeyboardInterrupt:
        # Ctrl-C stops scraping early; the PDF is still generated below
        break


print_articles_to_pdf()
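
A minimal usage sketch, assuming Python 3 with the requests and beautifulsoup4 packages installed and the wkhtmltopdf binary on PATH; the category path shown is hypothetical:

$ pip install requests beautifulsoup4
$ python g4g.py
Enter category url: fundamentals-of-algorithms/

The script then scrapes every article linked from that category page and writes GeeksForGeeks_fundamentals-of-algorithms.pdf.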
