From 3f9dece256d316090b035233c4e3bb2dc37208bc Mon Sep 17 00:00:00 2001 From: Rebecca Conley Date: Thu, 15 Sep 2016 19:54:04 -0400 Subject: [PATCH 1/2] removed unnecessary user agent headers --- tasks.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tasks.py b/tasks.py index 473adf5..4adf1e2 100644 --- a/tasks.py +++ b/tasks.py @@ -8,14 +8,9 @@ from celeryapp import app -USER_AGENT = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' - - @app.task def song_links(): - page = requests.get( - 'http://www.songlyrics.com/taylor-swift-lyrics/', - headers={'user-agent': USER_AGENT}) + page = requests.get('http://www.songlyrics.com/taylor-swift-lyrics/') parsed_content = BeautifulSoup(page.content, 'html.parser') links = parsed_content.find('div', {'id': 'colone-container'}) links = links.find('table', {'class': 'tracklist'}) @@ -33,7 +28,7 @@ def song_links(): @app.task def song_lyrics(url, title): - page = requests.get(url, headers={'user-agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'}) + page = requests.get(url) # have to mimic google bot here parsed_content = BeautifulSoup(page.content, 'html.parser') lyrics = parsed_content.find('p', {'id':'songLyricsDiv'}).text From ae5de476b0edf97e3f00a0cf73afc69979cc53d0 Mon Sep 17 00:00:00 2001 From: Rebecca Conley Date: Thu, 15 Sep 2016 20:23:37 -0400 Subject: [PATCH 2/2] remove comment --- tasks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tasks.py b/tasks.py index 4adf1e2..ace6357 100644 --- a/tasks.py +++ b/tasks.py @@ -29,7 +29,6 @@ def song_links(): @app.task def song_lyrics(url, title): page = requests.get(url) - # have to mimic google bot here parsed_content = BeautifulSoup(page.content, 'html.parser') lyrics = parsed_content.find('p', {'id':'songLyricsDiv'}).text filename = os.path.join(OUTPUT_DIR, '{}.txt'.format(title))