diff --git a/tasks.py b/tasks.py index 473adf5..ace6357 100644 --- a/tasks.py +++ b/tasks.py @@ -8,14 +8,9 @@ from celeryapp import app -USER_AGENT = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' - - @app.task def song_links(): - page = requests.get( - 'http://www.songlyrics.com/taylor-swift-lyrics/', - headers={'user-agent': USER_AGENT}) + page = requests.get('http://www.songlyrics.com/taylor-swift-lyrics/') parsed_content = BeautifulSoup(page.content, 'html.parser') links = parsed_content.find('div', {'id': 'colone-container'}) links = links.find('table', {'class': 'tracklist'}) @@ -33,8 +28,7 @@ def song_links(): @app.task def song_lyrics(url, title): - page = requests.get(url, headers={'user-agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'}) - # have to mimic google bot here + page = requests.get(url) parsed_content = BeautifulSoup(page.content, 'html.parser') lyrics = parsed_content.find('p', {'id':'songLyricsDiv'}).text filename = os.path.join(OUTPUT_DIR, '{}.txt'.format(title))