From 36da52e4d1e4782308f5e5c56a8238379e223675 Mon Sep 17 00:00:00 2001 From: Julien Deswaef Date: Mon, 10 Oct 2016 23:09:57 -0400 Subject: [PATCH 1/9] added an RSS parser for lesoir.be --- nytdiff.py | 145 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 135 insertions(+), 10 deletions(-) diff --git a/nytdiff.py b/nytdiff.py index e5fd615735..7288b00623 100644 --- a/nytdiff.py +++ b/nytdiff.py @@ -18,7 +18,9 @@ from simplediff import html_diff from selenium import webdriver -TIMEZONE = 'America/Buenos_Aires' +import feedparser + +TIMEZONE = 'Europe/Brussels' LOCAL_TZ = timezone(TIMEZONE) MAX_RETRIES = 10 RETRY_DELAY = 3 @@ -141,6 +143,8 @@ def tweet(self, text, article_id, url, column='id'): logging.info('Article id: %s', article_id) reply_to = self.get_prev_tweet(article_id, column) if reply_to is None: + + logging.info('Tweeting url: %s', url) tweet = self.tweet_text(url) reply_to = tweet.id @@ -302,7 +306,7 @@ def store_data(self, data): ORDER BY version DESC \ LIMIT 1' % (data['article_id'])) for row in result: - data['version'] = row['version'] + 1 + data['version'] = row['version'] self.versions_table.insert(data) url = data['url'] if row['url'] != data['url']: @@ -363,6 +367,125 @@ def parse_pages(self): if loop: self.remove_old('article_id') +class RSSParser(BaseParser): + def __init__(self, api, rss_url): + BaseParser.__init__(self, api) + self.urls = [rss_url] + self.articles_table = self.db['rss_ids'] + self.versions_table = self.db['rss_versions'] + + def entry_to_dict(self, article): + article_dict = dict() + article_dict['article_id'] = article.id.split(' ')[0] + article_dict['url'] = article.link + article_dict['title'] = article.title + article_dict['abstract'] = self.strip_html(article.description) + article_dict['author'] = article.author + article_dict['illustration'] = article.media_content[0]['url'] + article_dict['illustartion_size'] = article.media_content[0]['filesize'] + od = collections.OrderedDict(sorted(article_dict.items())) + article_dict['hash'] = hashlib.sha224( + repr(od.items()).encode('utf-8')).hexdigest() + article_dict['date_time'] = datetime.now(LOCAL_TZ) + return article_dict + + def store_data(self, data): + if self.articles_table.find_one( + article_id=data['article_id']) is None: # New + article = { + 'article_id': data['article_id'], + 'add_dt': data['date_time'], + 'status': 'home', + 'tweet_id': None + } + self.articles_table.insert(article) + logging.info('New article tracked: %s', data['url']) + data['version'] = 1 + self.versions_table.insert(data) + else: + # re insert + if self.articles_table.find_one(article_id=data['article_id'], + status='removed') is not None: + article = { + 'article_id': data['article_id'], + 'add_dt': data['date_time'], + } + + count = self.versions_table.count( + self.versions_table.table.columns.article_id == data[ + 'article_id'], + hash=data['hash']) + if count == 1: # Existing + pass + else: # Changed + result = self.db.query('SELECT * \ + FROM rss_versions\ + WHERE article_id = "%s" \ + ORDER BY version DESC \ + LIMIT 1' % (data['article_id'])) + for row in result: + data['version'] = row['version'] + self.versions_table.insert(data) + url = data['url'] + if row['url'] != data['url']: + if self.show_diff(row['url'], data['url']): + tweet_text = "Modification d'URL" + logging.info(tweet_text) + # self.tweet(tweet_text, data['article_id'], url, + # 'article_id') + if row['title'] != data['title']: + if self.show_diff(row['title'], data['title']): + tweet_text = "Modification du Titre" + logging.info(tweet_text) + # self.tweet(tweet_text, data['article_id'], url, + # 'article_id') + if row['abstract'] != data['abstract']: + if self.show_diff(row['abstract'], data['abstract']): + tweet_text = "Modification de la Description" + logging.info(tweet_text) + # self.tweet(tweet_text, data['article_id'], url, + # 'article_id') + if row['author'] != data['author']: + if self.show_diff(row['author'], data['author']): + tweet_text = "Modification de l'auteur" + logging.info(tweet_text) + # self.tweet(tweet_text, data['article_id'], url, + # 'article_id') + if row['illustration'] != data['illustration'] or row['illustration_size'] != data['illustration_size']: + if self.show_diff(row['illustration_size'], data['illustration_size']): + tweet_text = "Modification de l'illustration" + logging.info(tweet_text) + # self.tweet(tweet_text, data['article_id'], url, + # 'article_id') + + def loop_entries(self, entries): + if len(entries) == 0: + return False + for article in entries: + try: + article_dict = self.entry_to_dict(article) + if article_dict is not None: + self.store_data(article_dict) + self.current_ids.add(article_dict['article_id']) + except BaseException as e: + logging.exception('Problem looping RSS: %s', article) + print ('Exception: {}'.format(str(e))) + print('***************') + print(article) + print('***************') + return False + return True + + def parse_rss(self): + r = feedparser.parse(self.urls[0]) + if r is None: + logging.warning('Empty response RSS') + return + else: + logging.info('Parsing %s', r.feed.title) + loop = self.loop_entries(r.entries) + if loop: + self.remove_old('article_id') def main(): # logging @@ -380,17 +503,19 @@ def main(): auth = tweepy.OAuthHandler(consumer_key, consumer_secret) auth.secure = True auth.set_access_token(access_token, access_token_secret) - nyt_api = tweepy.API(auth) - logging.debug('NYT Twitter API configured') + twitter_api = tweepy.API(auth) + logging.debug('Twitter API configured') try: - logging.debug('Starting NYT') - nyt_api_key = os.environ['NYT_API_KEY'] - nyt = NYTParser(nyt_api, nyt_api_key) - nyt.parse_pages() - logging.debug('Finished NYT') + logging.debug('Starting RSS') + #nyt_api_key = os.environ['NYT_API_KEY'] + #nyt = NYTParser(nyt_api, nyt_api_key) + rss_url = os.environ['RSS_URL'] + rss = RSSParser(twitter_api, rss_url) + rss.parse_rss() + logging.debug('Finished RSS') except: - logging.exception('NYT') + logging.exception('RSS') logging.info('Finished script') From f8f7d74fdd0ea74970da498c549c10512f60898a Mon Sep 17 00:00:00 2001 From: Julien Deswaef Date: Mon, 10 Oct 2016 23:18:49 -0400 Subject: [PATCH 2/9] added feedparser in the requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 667343ad95..cf889d3892 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ alembic==0.8.7 bleach==1.4.3 dataset==0.6.4 +feedparser==5.2.1 html5lib==0.9999999 Mako==1.0.4 MarkupSafe==0.23 From 1163ff021be9476c635e5d9a140f0f7e1662cfd8 Mon Sep 17 00:00:00 2001 From: Julien Deswaef Date: Tue, 11 Oct 2016 10:19:54 -0400 Subject: [PATCH 3/9] Improved preventing tweets when TESTING --- nytdiff.py | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/nytdiff.py b/nytdiff.py index 7288b00623..b76bd128f7 100644 --- a/nytdiff.py +++ b/nytdiff.py @@ -126,6 +126,7 @@ def tweet_with_media(self, text, images, reply_to=None): def tweet_text(self, text): if TESTING: print (text) + return True try: tweet_id = self.api.update_status(status=text) except: @@ -143,15 +144,17 @@ def tweet(self, text, article_id, url, column='id'): logging.info('Article id: %s', article_id) reply_to = self.get_prev_tweet(article_id, column) if reply_to is None: - - logging.info('Tweeting url: %s', url) tweet = self.tweet_text(url) - reply_to = tweet.id + reply_to = tweet.id if not TESTING else 'test_tweet_id' logging.info('Replying to: %s', reply_to) tweet = self.tweet_with_media(text, images, reply_to) - logging.info('Id to store: %s', tweet.id) - self.update_tweet_db(article_id, tweet.id, column) + if TESTING : + tweet_id = 'test_tweet_id' + else: + tweet_id = tweet.id + logging.info('Id to store: %s', tweet_id) + self.update_tweet_db(article_id, tweet_id, column) return def get_page(self, url, header=None, payload=None): @@ -382,7 +385,7 @@ def entry_to_dict(self, article): article_dict['abstract'] = self.strip_html(article.description) article_dict['author'] = article.author article_dict['illustration'] = article.media_content[0]['url'] - article_dict['illustartion_size'] = article.media_content[0]['filesize'] + # article_dict['illustartion_size'] = article.media_content[0]['filesize'] od = collections.OrderedDict(sorted(article_dict.items())) article_dict['hash'] = hashlib.sha224( repr(od.items()).encode('utf-8')).hexdigest() @@ -430,33 +433,28 @@ def store_data(self, data): if row['url'] != data['url']: if self.show_diff(row['url'], data['url']): tweet_text = "Modification d'URL" - logging.info(tweet_text) - # self.tweet(tweet_text, data['article_id'], url, - # 'article_id') + self.tweet(tweet_text, data['article_id'], url, + 'article_id') if row['title'] != data['title']: if self.show_diff(row['title'], data['title']): tweet_text = "Modification du Titre" - logging.info(tweet_text) - # self.tweet(tweet_text, data['article_id'], url, - # 'article_id') + self.tweet(tweet_text, data['article_id'], url, + 'article_id') if row['abstract'] != data['abstract']: if self.show_diff(row['abstract'], data['abstract']): tweet_text = "Modification de la Description" - logging.info(tweet_text) - # self.tweet(tweet_text, data['article_id'], url, - # 'article_id') + self.tweet(tweet_text, data['article_id'], url, + 'article_id') if row['author'] != data['author']: if self.show_diff(row['author'], data['author']): tweet_text = "Modification de l'auteur" - logging.info(tweet_text) - # self.tweet(tweet_text, data['article_id'], url, - # 'article_id') - if row['illustration'] != data['illustration'] or row['illustration_size'] != data['illustration_size']: - if self.show_diff(row['illustration_size'], data['illustration_size']): + self.tweet(tweet_text, data['article_id'], url, + 'article_id') + if row['illustration'] != data['illustration']: + if self.show_diff(row['illustration'], data['illustration']): tweet_text = "Modification de l'illustration" - logging.info(tweet_text) - # self.tweet(tweet_text, data['article_id'], url, - # 'article_id') + self.tweet(tweet_text, data['article_id'], url, + 'article_id') def loop_entries(self, entries): if len(entries) == 0: From aeba91bd3202a0d2012aba69b9a30b24f7ac5130 Mon Sep 17 00:00:00 2001 From: Julien Deswaef Date: Tue, 11 Oct 2016 10:25:10 -0400 Subject: [PATCH 4/9] Removing tracking illustration changes. Just the url changes, not the visual content. --- nytdiff.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/nytdiff.py b/nytdiff.py index b76bd128f7..f9849c5765 100644 --- a/nytdiff.py +++ b/nytdiff.py @@ -384,7 +384,7 @@ def entry_to_dict(self, article): article_dict['title'] = article.title article_dict['abstract'] = self.strip_html(article.description) article_dict['author'] = article.author - article_dict['illustration'] = article.media_content[0]['url'] + # article_dict['illustration'] = article.media_content[0]['url'] # article_dict['illustartion_size'] = article.media_content[0]['filesize'] od = collections.OrderedDict(sorted(article_dict.items())) article_dict['hash'] = hashlib.sha224( @@ -450,11 +450,6 @@ def store_data(self, data): tweet_text = "Modification de l'auteur" self.tweet(tweet_text, data['article_id'], url, 'article_id') - if row['illustration'] != data['illustration']: - if self.show_diff(row['illustration'], data['illustration']): - tweet_text = "Modification de l'illustration" - self.tweet(tweet_text, data['article_id'], url, - 'article_id') def loop_entries(self, entries): if len(entries) == 0: From 20cc5cd5a8bdf6d76bac7a823f51e8c29b884185 Mon Sep 17 00:00:00 2001 From: Julien Deswaef Date: Tue, 11 Oct 2016 11:02:19 -0400 Subject: [PATCH 5/9] Allows url to be displayed correctly. Without this they will be truncated --- css/styles.css | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/css/styles.css b/css/styles.css index ec24f9e2e6..ed5ec3e8ab 100755 --- a/css/styles.css +++ b/css/styles.css @@ -1,12 +1,12 @@ -@font-face { - font-family: Merriweather; +@font-face { + font-family: Merriweather; font-style: normal; font-weight: normal; - src: url('../fonts/Merriweather-Regular.ttf') format("truetype"); -} + src: url('../fonts/Merriweather-Regular.ttf') format("truetype"); +} -body { - background: lightgray url('../img/paper_fibers.png') repeat; +body { + background: lightgray url('../img/paper_fibers.png') repeat; font-family: Merriweather; font-size: 16px; } @@ -17,6 +17,7 @@ p { margin-top: 1em; margin-bottom: 1em; font-weight: normal; + word-wrap: break-word; } del { From 7996d8e7f9b674062da3fa8fb8c68ead84f17799 Mon Sep 17 00:00:00 2001 From: Julien Deswaef Date: Tue, 11 Oct 2016 11:48:47 -0400 Subject: [PATCH 6/9] changed setting tmp twitter id when TESTING --- nytdiff.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nytdiff.py b/nytdiff.py index f9849c5765..1062aa1f93 100644 --- a/nytdiff.py +++ b/nytdiff.py @@ -146,11 +146,13 @@ def tweet(self, text, article_id, url, column='id'): if reply_to is None: logging.info('Tweeting url: %s', url) tweet = self.tweet_text(url) - reply_to = tweet.id if not TESTING else 'test_tweet_id' + # if TESTING, give a random id based on time + reply_to = tweet.id if not TESTING else time.time() logging.info('Replying to: %s', reply_to) tweet = self.tweet_with_media(text, images, reply_to) if TESTING : - tweet_id = 'test_tweet_id' + # if TESTING, give a random id based on time + tweet_id = time.time() else: tweet_id = tweet.id logging.info('Id to store: %s', tweet_id) From 97b9f66678898118ff320c6e8421903e58a12b6f Mon Sep 17 00:00:00 2001 From: Julien Deswaef Date: Tue, 11 Oct 2016 11:49:18 -0400 Subject: [PATCH 7/9] Added placeholder for RSS link --- run_diff.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/run_diff.sh b/run_diff.sh index 1ab5bf0082..a3fa18aac7 100755 --- a/run_diff.sh +++ b/run_diff.sh @@ -7,6 +7,7 @@ export NYT_TWITTER_ACCESS_TOKEN="" export NYT_TWITTER_ACCESS_TOKEN_SECRET="" export NYT_API_KEY="" +export RSS_URL="" export PHANTOMJS_PATH="./" From fd16d59d64c4fba58630a26981b68fabd4416a0d Mon Sep 17 00:00:00 2001 From: Julien Deswaef Date: Tue, 11 Oct 2016 12:00:01 -0400 Subject: [PATCH 8/9] added explanation and credits --- README.md | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index d7b81daa2e..982f494a89 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,21 @@ -# NYTdiff +# NYTdiff+ -Code for the twitter bot [@nyt_diff](https://twitter.com/nyt_diff). +Based on @j-e-d's code for the twitter bot [@nyt_diff](https://twitter.com/nyt_diff). +RSS feed fetching added for @xuv's twitter bot [@lesoir_diff](https://twitter.com/lesoir_diff) -The [phantomjs](http://phantomjs.org/) binary needs to be installed and the path updated in the run_diff.sh file. +[Twitter keys](https://dev.twitter.com/) are needed. +[NYT API](http://developers.nytimes.com/) key for the "Top Stories V2" are needed for The New York Times. +An RSS Url is needed for [Le Soir](http://lesoir.be) or any other news website. -[Twitter keys](https://dev.twitter.com/) and the [NYT API](http://developers.nytimes.com/) key for the "Top Stories V2" service are needed, values of this keys need to be entered in the run_diff.sh file. +Installation +------------ ++ The [phantomjs](http://phantomjs.org/) binary needs to be installed and the path updated in the run_diff.sh file. ++ `pip install -r requirements.txt` -Font: [Merriweather](https://fonts.google.com/specimen/Merriweather). Background pattern: [Paper Fibers](http://subtlepatterns.com/paper-fibers/). +Credits +------- ++ Original script and idea: @j-e-d Juan E.D. http://unahormiga.com/ ++ RSS fetching: @xuv Julien Deswaef http://xuv.be ++ Font: [Merriweather](https://fonts.google.com/specimen/Merriweather) ++ Background pattern: [Paper Fibers](http://subtlepatterns.com/paper-fibers/). From 8569a3bd26b4b176ed43152aa59384273e468bc5 Mon Sep 17 00:00:00 2001 From: Julien Deswaef Date: Tue, 11 Oct 2016 17:43:05 -0400 Subject: [PATCH 9/9] bug on the version number. Slipped incrementation now back on --- nytdiff.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nytdiff.py b/nytdiff.py index 1062aa1f93..c6c8b224af 100644 --- a/nytdiff.py +++ b/nytdiff.py @@ -429,14 +429,9 @@ def store_data(self, data): ORDER BY version DESC \ LIMIT 1' % (data['article_id'])) for row in result: - data['version'] = row['version'] + data['version'] = row['version'] +1 self.versions_table.insert(data) url = data['url'] - if row['url'] != data['url']: - if self.show_diff(row['url'], data['url']): - tweet_text = "Modification d'URL" - self.tweet(tweet_text, data['article_id'], url, - 'article_id') if row['title'] != data['title']: if self.show_diff(row['title'], data['title']): tweet_text = "Modification du Titre" @@ -452,6 +447,11 @@ def store_data(self, data): tweet_text = "Modification de l'auteur" self.tweet(tweet_text, data['article_id'], url, 'article_id') + if row['url'] != data['url']: + if self.show_diff(row['url'], data['url']): + tweet_text = "Modification d'URL" + self.tweet(tweet_text, data['article_id'], url, + 'article_id') def loop_entries(self, entries): if len(entries) == 0: