| 1 | +#!/usr/bin/python |
| 2 | + |
1 | 3 | import requests |
2 | | -import subprocess |
3 | 4 | from bs4 import BeautifulSoup |
4 | 5 | from os import system |
5 | | - |
| 6 | +from sys import exit |
| 7 | +from time import sleep |
| 8 | +from requests.exceptions import ConnectionError |
6 | 9 |
7 | 10 | BASE_URL = 'http://www.geeksforgeeks.org/' |
| 11 | +articles = [] |
8 | 12 |
9 | | -categoryUrl = raw_input("Enter category url: ") |
10 | | -soup = BeautifulSoup(requests.get(BASE_URL + categoryUrl).text) |
| 13 | +choice_to_category = {1: 'c', 2: 'c-plus-plus', 3: 'java', |
| 14 | + 4: 'fundamental-of-algorithms', |
| 15 | + 5: 'data-structures'} |
| 16 | + |
| 17 | +def display_menu(): |
| 18 | + print("Choose category to scrape: ") |
| 19 | + print("1. C Language") |
| 20 | + print("2. C++ Language") |
| 21 | + print("3. Java") |
| 22 | + print("4. Algorithms") |
| 23 | + print("5. Data Structures") |
11 | 24 |
12 | | -articles = [] |
13 | 25 |
14 | | -def print_articles_to_pdf(): |
| 26 | +def get_category_choice(): |
| 27 | +    try: |
| 28 | +        choice = int(raw_input("Enter choice: ")) |
| 29 | +        categoryUrl = choice_to_category[choice] |
| 30 | +    except (ValueError, KeyError): |
| 31 | +        print("Wrong Choice Entered. Exiting!") |
| 32 | +        exit(1) |
| 33 | + return categoryUrl |
| 34 | + |
| 35 | + |
| 36 | +def save_articles_as_html_and_pdf(): |
15 | 37 | print("All links scraped, extracting articles") |
16 | | - allArticles = '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />' |
17 | | - allArticles += '<br><br><br><br>'.join(articles) |
18 | | - Html_file= open("temp.html","w") |
| 38 | + # Formatting the html for articles |
| 39 | + allArticles = ('<!DOCTYPE html>' |
| 40 | + '<html><head>' |
| 41 | + '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />' |
| 42 | + '<link rel="stylesheet" href="style.min.css" type="text/css" media="all" />' |
| 43 | + '</head><body>' |
| 44 | + ) |
| 45 | + allArticles += '<h1 style="text-align:center;font-size:40px">' + categoryUrl.title() + ' Archive</h1><hr>' |
| 46 | + allArticles += '<hr>'.join(articles) |
| 47 | + allArticles += ''' |
| 48 | + </body></html> |
| 49 | + ''' |
| 50 | + html_file_name = 'G4G_' + categoryUrl.title() + '.html' |
| 51 | +    Html_file = open(html_file_name, "w") |
19 | 52 | Html_file.write(allArticles) |
20 | 53 | Html_file.close() |
21 | | - print("Generating PDF GeeksForGeeks_" + categoryUrl) |
22 | | - html_to_pdf_command = 'wkhtmltopdf temp.html GeeksForGeeks_' + categoryUrl + '.pdf' |
| 54 | + pdf_file_name = 'G4G_' + categoryUrl.title() + '.pdf' |
| 55 | + print("Generating PDF " + pdf_file_name) |
| 56 | + html_to_pdf_command = 'wkhtmltopdf ' + html_file_name + ' ' + pdf_file_name |
23 | 57 | system(html_to_pdf_command) |
24 | 58 |
25 | 59 |
26 | | -# Selecting links which are in the category page |
27 | | -links = [a.attrs.get('href') for a in soup.select('article li a')] |
28 | | -# Removing links for the categories with anchor on same page |
29 | | -links = [link for link in links if not link.startswith('#')] |
| 60 | +def scrape_category(categoryUrl): |
| 61 | + try: |
| 62 | + soup = BeautifulSoup(requests.get(BASE_URL + categoryUrl).text) |
| 63 | + except ConnectionError: |
| 64 | +        print("Couldn't connect to the Internet! Please check your connection and try again.") |
| 65 | + exit(1) |
| 66 | + # Selecting links which are in the category page |
| 67 | + links = [a.attrs.get('href') for a in soup.select('article li a')] |
| 68 | + # Removing links for the categories with anchor on same page |
| 69 | + links = [link for link in links if not link.startswith('#')] |
30 | 70 |
31 | | -print("Found: " + str(len(links)) + " links") |
32 | | -i = 1 |
| 71 | + print("Found: " + str(len(links)) + " links") |
| 72 | + i = 1 |
33 | 73 |
34 | | -for link in links: |
35 | | - try: |
36 | | - print("Scraping link no: " + str(i) + " Link: " + link ) |
37 | | - link_soup = BeautifulSoup(requests.get(link).text) |
38 | | - article = link_soup.find('article') |
39 | | - articles.append(article.encode('UTF-8')) |
40 | | - i = i + 1 |
41 | | - except KeyboardInterrupt: |
42 | | - break |
| 74 | +    # Traverse each link, extract its article and save it. |
| 75 | + for link in links: |
| 76 | + try: |
| 77 | +            if i > 1 and i % 10 == 1: |
| 78 | + sleep(5) # Sleep for 5 seconds after getting every 10th link |
| 79 | + print("Scraping link no: " + str(i) + " Link: " + link ) |
| 80 | + i = i + 1 |
| 81 | + link_soup = BeautifulSoup(requests.get(link).text) |
| 82 | +            # Remove the space occupied by Google Ads (drop script and ins nodes) |
| 83 | + [script.extract() for script in link_soup(["script", "ins"])] |
| 84 | + article = link_soup.find('article') |
| 85 | + # Now add this article to list of all articles |
| 86 | + articles.append(article.encode('UTF-8')) |
| 87 | +        # requests sometimes hangs on a link; press Ctrl+C to skip to the next one. |
| 88 | +        # TODO: find the cause and improve this. |
| 89 | + except KeyboardInterrupt: |
| 90 | + continue |
| 91 | + except ConnectionError: |
| 92 | +            print("Internet disconnected! Please check your connection and try again.") |
| 93 | + if articles: |
| 94 | + print("Making PDF of links scraped till now.") |
| 95 | + break |
| 96 | + else: |
| 97 | + exit(1) |
43 | 98 |
44 | 99 |
45 | | -print_articles_to_pdf() |
| 100 | +if __name__ == '__main__': |
| 101 | + display_menu() |
| 102 | + categoryUrl = get_category_choice() |
| 103 | + scrape_category(categoryUrl) |
| 104 | + save_articles_as_html_and_pdf() |
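
The PDF step above shells out to wkhtmltopdf through os.system with string concatenation, while the commit drops the subprocess import. A minimal sketch of the same call through subprocess.call, assuming wkhtmltopdf is on the PATH and reusing the html_file_name and pdf_file_name values built above (convert_html_to_pdf is a hypothetical helper, not part of this commit); passing the arguments as a list avoids shell quoting problems if a file name ever contains spaces, and the return code reports whether the conversion succeeded:

    import subprocess

    def convert_html_to_pdf(html_file_name, pdf_file_name):
        # Argument-list form: no shell is involved, so no quoting or injection issues.
        return_code = subprocess.call(['wkhtmltopdf', html_file_name, pdf_file_name])
        if return_code != 0:
            print("wkhtmltopdf exited with code " + str(return_code))
        return return_code == 0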
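
The scraping loop notes that a request sometimes hangs and relies on Ctrl+C to skip to the next link. One plausible cause is a stalled connection with no read timeout. A minimal sketch of the per-link fetch with an explicit timeout and parser; get_article_html is a hypothetical helper, and the 10-second timeout is an assumption rather than a value from this commit:

    import requests
    from bs4 import BeautifulSoup

    def get_article_html(link):
        try:
            # A timeout makes requests raise instead of blocking indefinitely.
            response = requests.get(link, timeout=10)
        except requests.exceptions.RequestException:
            print("Failed to fetch " + link + ", skipping.")
            return None
        # Naming the parser avoids BeautifulSoup's "no parser was explicitly specified" warning.
        link_soup = BeautifulSoup(response.text, 'html.parser')
        article = link_soup.find('article')
        return article.encode('UTF-8') if article is not None else None

In the loop, the append would then only run when the helper returns something instead of None.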