
Commit 7c4fc81

Styled results in HTML and PDF form
Parent: 66d6919

File tree: 2 files changed (+87 / -27 lines)

g4g.py (file mode changed 100644 → 100755)
Lines changed: 86 additions & 27 deletions
@@ -1,45 +1,104 @@
+#!/usr/bin/python
+
 import requests
-import subprocess
 from bs4 import BeautifulSoup
 from os import system
-
+from sys import exit
+from time import sleep
+from requests.exceptions import ConnectionError

 BASE_URL = 'http://www.geeksforgeeks.org/'
+articles = []

-categoryUrl = raw_input("Enter category url: ")
-soup = BeautifulSoup(requests.get(BASE_URL + categoryUrl).text)
+choice_to_category = {1: 'c', 2: 'c-plus-plus', 3: 'java',
+                      4: 'fundamental-of-algorithms',
+                      5: 'data-structures'}
+
+def display_menu():
+    print("Choose category to scrape: ")
+    print("1. C Language")
+    print("2. C++ Language")
+    print("3. Java")
+    print("4. Algorithms")
+    print("5. Data Structures")

-articles = []

-def print_articles_to_pdf():
+def get_category_choice():
+    choice = int(raw_input("Enter choice: "))
+    try:
+        categoryUrl = choice_to_category[choice]
+    except KeyError:
+        print("Wrong Choice Entered. Exiting!")
+        exit(1)
+    return categoryUrl
+
+
+def save_articles_as_html_and_pdf():
     print("All links scraped, extracting articles")
-    allArticles = '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'
-    allArticles += '<br><br><br><br>'.join(articles)
-    Html_file= open("temp.html","w")
+    # Formatting the html for articles
+    allArticles = ('<!DOCTYPE html>'
+                   '<html><head>'
+                   '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'
+                   '<link rel="stylesheet" href="style.min.css" type="text/css" media="all" />'
+                   '</head><body>'
+                   )
+    allArticles += '<h1 style="text-align:center;font-size:40px">' + categoryUrl.title() + ' Archive</h1><hr>'
+    allArticles += '<hr>'.join(articles)
+    allArticles += '''
+    </body></html>
+    '''
+    html_file_name = 'G4G_' + categoryUrl.title() + '.html'
+    Html_file= open(html_file_name, "w")
     Html_file.write(allArticles)
     Html_file.close()
-    print("Generating PDF GeeksForGeeks_" + categoryUrl)
-    html_to_pdf_command = 'wkhtmltopdf temp.html GeeksForGeeks_' + categoryUrl + '.pdf'
+    pdf_file_name = 'G4G_' + categoryUrl.title() + '.pdf'
+    print("Generating PDF " + pdf_file_name)
+    html_to_pdf_command = 'wkhtmltopdf ' + html_file_name + ' ' + pdf_file_name
     system(html_to_pdf_command)


-# Selecting links which are in the category page
-links = [a.attrs.get('href') for a in soup.select('article li a')]
-# Removing links for the categories with anchor on same page
-links = [link for link in links if not link.startswith('#')]
+def scrape_category(categoryUrl):
+    try:
+        soup = BeautifulSoup(requests.get(BASE_URL + categoryUrl).text)
+    except ConnectionError:
+        print("Couldn't connect to Internet! Please check your connection & Try again.")
+        exit(1)
+    # Selecting links which are in the category page
+    links = [a.attrs.get('href') for a in soup.select('article li a')]
+    # Removing links for the categories with anchor on same page
+    links = [link for link in links if not link.startswith('#')]

-print("Found: " + str(len(links)) + " links")
-i = 1
+    print("Found: " + str(len(links)) + " links")
+    i = 1

-for link in links:
-    try:
-        print("Scraping link no: " + str(i) + " Link: " + link )
-        link_soup = BeautifulSoup(requests.get(link).text)
-        article = link_soup.find('article')
-        articles.append(article.encode('UTF-8'))
-        i = i + 1
-    except KeyboardInterrupt:
-        break
+    # Traverse each link to find article and save it.
+    for link in links:
+        try:
+            if(i % 11 == 0):
+                sleep(5) # Sleep for 5 seconds after getting every 10th link
+            print("Scraping link no: " + str(i) + " Link: " + link )
+            i = i + 1
+            link_soup = BeautifulSoup(requests.get(link).text)
+            # Remove the space occupied by Google Ads (Drop script & ins node)
+            [script.extract() for script in link_soup(["script", "ins"])]
+            article = link_soup.find('article')
+            # Now add this article to list of all articles
+            articles.append(article.encode('UTF-8'))
+        # Sometimes hanging. So Ctrl ^ C, and try the next link.
+        # Find out the reason & improve this.
+        except KeyboardInterrupt:
+            continue
+        except ConnectionError:
+            print("Internet disconnected! Please check your connection & Try again.")
+            if articles:
+                print("Making PDF of links scraped till now.")
+                break
+            else:
+                exit(1)


-print_articles_to_pdf()
+if __name__ == '__main__':
+    display_menu()
+    categoryUrl = get_category_choice()
+    scrape_category(categoryUrl)
+    save_articles_as_html_and_pdf()
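
For reference, two small sketches (not part of the commit) that restate what the code above does. First, the output files are now named after the chosen category via categoryUrl.title(); for example, menu choice 5 maps to the slug 'data-structures', which in a Python 2 shell (the script uses raw_input, so it targets Python 2) gives:

    >>> categoryUrl = 'data-structures'           # slug for menu choice 5
    >>> 'G4G_' + categoryUrl.title() + '.html'    # html_file_name in save_articles_as_html_and_pdf()
    'G4G_Data-Structures.html'
    >>> 'G4G_' + categoryUrl.title() + '.pdf'     # pdf_file_name, then rendered by wkhtmltopdf
    'G4G_Data-Structures.pdf'

Second, a minimal sketch of what the new script/ins stripping line in scrape_category() does, assuming bs4 is installed; the HTML string here is made up for illustration and is not part of g4g.py:

    from bs4 import BeautifulSoup

    # A made-up article page containing an ad script and an ad slot
    page = '<article><p>Content</p><script>ads()</script><ins>ad slot</ins></article>'
    link_soup = BeautifulSoup(page)
    # Same idiom as in the diff: drop every script and ins node before saving the article
    [node.extract() for node in link_soup(["script", "ins"])]
    print(link_soup.find('article'))   # -> <article><p>Content</p></article>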

style.min.css
Lines changed: 1 addition & 0 deletions
(style.min.css is a generated, minified stylesheet, so its single added line is not rendered in the diff; it is the file referenced by the <link rel="stylesheet" href="style.min.css" ...> tag in the HTML that g4g.py now writes.)
