From 36da52e4d1e4782308f5e5c56a8238379e223675 Mon Sep 17 00:00:00 2001
From: Julien Deswaef <juego@requiem4tv.com>
Date: Mon, 10 Oct 2016 23:09:57 -0400
Subject: [PATCH 1/9] added an RSS parser for lesoir.be

---
 nytdiff.py | 145 +++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 135 insertions(+), 10 deletions(-)

diff --git a/nytdiff.py b/nytdiff.py
index e5fd615735..7288b00623 100644
--- a/nytdiff.py
+++ b/nytdiff.py
@@ -18,7 +18,9 @@
 from simplediff import html_diff
 from selenium import webdriver
 
-TIMEZONE = 'America/Buenos_Aires'
+import feedparser
+
+TIMEZONE = 'Europe/Brussels'
 LOCAL_TZ = timezone(TIMEZONE)
 MAX_RETRIES = 10
 RETRY_DELAY = 3
@@ -141,6 +143,8 @@ def tweet(self, text, article_id, url, column='id'):
         logging.info('Article id: %s', article_id)
         reply_to = self.get_prev_tweet(article_id, column)
         if reply_to is None:
+
+
             logging.info('Tweeting url: %s', url)
             tweet = self.tweet_text(url)
             reply_to = tweet.id
@@ -302,7 +306,7 @@ def store_data(self, data):
                                        ORDER BY version DESC \
                                        LIMIT 1' % (data['article_id']))
                 for row in result:
-                    data['version'] = row['version'] + 1
+                    data['version'] = row['version']
                     self.versions_table.insert(data)
                     url = data['url']
                     if row['url'] != data['url']:
@@ -363,6 +367,125 @@ def parse_pages(self):
         if loop:
             self.remove_old('article_id')
 
+class RSSParser(BaseParser):
+    def __init__(self, api, rss_url):
+        BaseParser.__init__(self, api)
+        self.urls = [rss_url]
+        self.articles_table = self.db['rss_ids']
+        self.versions_table = self.db['rss_versions']
+
+    def entry_to_dict(self, article):
+        article_dict = dict()
+        article_dict['article_id'] = article.id.split(' ')[0]
+        article_dict['url'] = article.link
+        article_dict['title'] = article.title
+        article_dict['abstract'] = self.strip_html(article.description)
+        article_dict['author'] = article.author
+        article_dict['illustration'] = article.media_content[0]['url']
+        article_dict['illustartion_size'] = article.media_content[0]['filesize']
+        od = collections.OrderedDict(sorted(article_dict.items()))
+        article_dict['hash'] = hashlib.sha224(
+            repr(od.items()).encode('utf-8')).hexdigest()
+        article_dict['date_time'] = datetime.now(LOCAL_TZ)
+        return article_dict
+
+    def store_data(self, data):
+        if self.articles_table.find_one(
+                article_id=data['article_id']) is None:  # New
+            article = {
+                'article_id': data['article_id'],
+                'add_dt': data['date_time'],
+                'status': 'home',
+                'tweet_id': None
+            }
+            self.articles_table.insert(article)
+            logging.info('New article tracked: %s', data['url'])
+            data['version'] = 1
+            self.versions_table.insert(data)
+        else:
+            # re insert
+            if self.articles_table.find_one(article_id=data['article_id'],
+                                            status='removed') is not None:
+                article = {
+                    'article_id': data['article_id'],
+                    'add_dt': data['date_time'],
+                }
+
+            count = self.versions_table.count(
+                self.versions_table.table.columns.article_id == data[
+                    'article_id'],
+                hash=data['hash'])
+            if count == 1:  # Existing
+                pass
+            else:  # Changed
+                result = self.db.query('SELECT * \
+                                       FROM rss_versions\
+                                       WHERE article_id = "%s" \
+                                       ORDER BY version DESC \
+                                       LIMIT 1' % (data['article_id']))
+                for row in result:
+                    data['version'] = row['version']
+                    self.versions_table.insert(data)
+                    url = data['url']
+                    if row['url'] != data['url']:
+                        if self.show_diff(row['url'], data['url']):
+                            tweet_text = "Modification d'URL"
+                            logging.info(tweet_text)
+                            # self.tweet(tweet_text, data['article_id'], url,
+                            #            'article_id')
+                    if row['title'] != data['title']:
+                        if self.show_diff(row['title'], data['title']):
+                            tweet_text = "Modification du Titre"
+                            logging.info(tweet_text)
+                            # self.tweet(tweet_text, data['article_id'], url,
+                            #            'article_id')
+                    if row['abstract'] != data['abstract']:
+                        if self.show_diff(row['abstract'], data['abstract']):
+                            tweet_text = "Modification de la Description"
+                            logging.info(tweet_text)
+                            # self.tweet(tweet_text, data['article_id'], url,
+                            #            'article_id')
+                    if row['author'] != data['author']:
+                        if self.show_diff(row['author'], data['author']):
+                            tweet_text = "Modification de l'auteur"
+                            logging.info(tweet_text)
+                            # self.tweet(tweet_text, data['article_id'], url,
+                            #            'article_id')
+                    if row['illustration'] != data['illustration'] or row['illustration_size'] != data['illustration_size']:
+                        if self.show_diff(row['illustration_size'], data['illustration_size']):
+                            tweet_text = "Modification de l'illustration"
+                            logging.info(tweet_text)
+                            # self.tweet(tweet_text, data['article_id'], url,
+                            #            'article_id')
+
+    def loop_entries(self, entries):
+        if len(entries) == 0:
+            return False
+        for article in entries:
+            try:
+                article_dict = self.entry_to_dict(article)
+                if article_dict is not None:
+                    self.store_data(article_dict)
+                    self.current_ids.add(article_dict['article_id'])
+            except BaseException as e:
+                logging.exception('Problem looping RSS: %s', article)
+                print ('Exception: {}'.format(str(e)))
+                print('***************')
+                print(article)
+                print('***************')
+                return False
+        return True
+
+    def parse_rss(self):
+        r = feedparser.parse(self.urls[0])
+        if r is None:
+            logging.warning('Empty response RSS')
+            return
+        else:
+            logging.info('Parsing %s', r.feed.title)
+        loop = self.loop_entries(r.entries)
+        if loop:
+            self.remove_old('article_id')
 
 def main():
     # logging
@@ -380,17 +503,19 @@ def main():
     auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
     auth.secure = True
     auth.set_access_token(access_token, access_token_secret)
-    nyt_api = tweepy.API(auth)
-    logging.debug('NYT Twitter API configured')
+    twitter_api = tweepy.API(auth)
+    logging.debug('Twitter API configured')
 
     try:
-        logging.debug('Starting NYT')
-        nyt_api_key = os.environ['NYT_API_KEY']
-        nyt = NYTParser(nyt_api, nyt_api_key)
-        nyt.parse_pages()
-        logging.debug('Finished NYT')
+        logging.debug('Starting RSS')
+        #nyt_api_key = os.environ['NYT_API_KEY']
+        #nyt = NYTParser(nyt_api, nyt_api_key)
+        rss_url = os.environ['RSS_URL']
+        rss = RSSParser(twitter_api, rss_url)
+        rss.parse_rss()
+        logging.debug('Finished RSS')
     except:
-        logging.exception('NYT')
+        logging.exception('RSS')
 
     logging.info('Finished script')
 

From f8f7d74fdd0ea74970da498c549c10512f60898a Mon Sep 17 00:00:00 2001
From: Julien Deswaef <juego@requiem4tv.com>
Date: Mon, 10 Oct 2016 23:18:49 -0400
Subject: [PATCH 2/9] added feedparser in the requirements

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 667343ad95..cf889d3892 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 alembic==0.8.7
 bleach==1.4.3
 dataset==0.6.4
+feedparser==5.2.1
 html5lib==0.9999999
 Mako==1.0.4
 MarkupSafe==0.23

From 1163ff021be9476c635e5d9a140f0f7e1662cfd8 Mon Sep 17 00:00:00 2001
From: Julien Deswaef <juego@requiem4tv.com>
Date: Tue, 11 Oct 2016 10:19:54 -0400
Subject: [PATCH 3/9] Improved prevention of tweets when TESTING

---
 nytdiff.py | 44 +++++++++++++++++++++-----------------------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/nytdiff.py b/nytdiff.py
index 7288b00623..b76bd128f7 100644
--- a/nytdiff.py
+++ b/nytdiff.py
@@ -126,6 +126,7 @@ def tweet_with_media(self, text, images, reply_to=None):
     def tweet_text(self, text):
         if TESTING:
             print (text)
+            return True
         try:
             tweet_id = self.api.update_status(status=text)
         except:
@@ -143,15 +144,17 @@ def tweet(self, text, article_id, url, column='id'):
         logging.info('Article id: %s', article_id)
         reply_to = self.get_prev_tweet(article_id, column)
         if reply_to is None:
-
-
             logging.info('Tweeting url: %s', url)
             tweet = self.tweet_text(url)
-            reply_to = tweet.id
+            reply_to = tweet.id if not TESTING else 'test_tweet_id'
         logging.info('Replying to: %s', reply_to)
         tweet = self.tweet_with_media(text, images, reply_to)
-        logging.info('Id to store: %s', tweet.id)
-        self.update_tweet_db(article_id, tweet.id, column)
+        if TESTING :
+            tweet_id = 'test_tweet_id'
+        else:
+            tweet_id = tweet.id
+        logging.info('Id to store: %s', tweet_id)
+        self.update_tweet_db(article_id, tweet_id, column)
         return
 
     def get_page(self, url, header=None, payload=None):
@@ -382,7 +385,7 @@ def entry_to_dict(self, article):
         article_dict['abstract'] = self.strip_html(article.description)
         article_dict['author'] = article.author
         article_dict['illustration'] = article.media_content[0]['url']
-        article_dict['illustartion_size'] = article.media_content[0]['filesize']
+        # article_dict['illustartion_size'] = article.media_content[0]['filesize']
         od = collections.OrderedDict(sorted(article_dict.items()))
         article_dict['hash'] = hashlib.sha224(
             repr(od.items()).encode('utf-8')).hexdigest()
@@ -430,33 +433,28 @@ def store_data(self, data):
                     if row['url'] != data['url']:
                         if self.show_diff(row['url'], data['url']):
                             tweet_text = "Modification d'URL"
-                            logging.info(tweet_text)
-                            # self.tweet(tweet_text, data['article_id'], url,
-                            #            'article_id')
+                            self.tweet(tweet_text, data['article_id'], url,
+                                       'article_id')
                     if row['title'] != data['title']:
                         if self.show_diff(row['title'], data['title']):
                             tweet_text = "Modification du Titre"
-                            logging.info(tweet_text)
-                            # self.tweet(tweet_text, data['article_id'], url,
-                            #            'article_id')
+                            self.tweet(tweet_text, data['article_id'], url,
+                                       'article_id')
                     if row['abstract'] != data['abstract']:
                         if self.show_diff(row['abstract'], data['abstract']):
                             tweet_text = "Modification de la Description"
-                            logging.info(tweet_text)
-                            # self.tweet(tweet_text, data['article_id'], url,
-                            #            'article_id')
+                            self.tweet(tweet_text, data['article_id'], url,
+                                       'article_id')
                     if row['author'] != data['author']:
                         if self.show_diff(row['author'], data['author']):
                             tweet_text = "Modification de l'auteur"
-                            logging.info(tweet_text)
-                            # self.tweet(tweet_text, data['article_id'], url,
-                            #            'article_id')
-                    if row['illustration'] != data['illustration'] or row['illustration_size'] != data['illustration_size']:
-                        if self.show_diff(row['illustration_size'], data['illustration_size']):
+                            self.tweet(tweet_text, data['article_id'], url,
+                                       'article_id')
+                    if row['illustration'] != data['illustration']:
+                        if self.show_diff(row['illustration'], data['illustration']):
                             tweet_text = "Modification de l'illustration"
-                            logging.info(tweet_text)
-                            # self.tweet(tweet_text, data['article_id'], url,
-                            #            'article_id')
+                            self.tweet(tweet_text, data['article_id'], url,
+                                       'article_id')
 
     def loop_entries(self, entries):
         if len(entries) == 0:

From aeba91bd3202a0d2012aba69b9a30b24f7ac5130 Mon Sep 17 00:00:00 2001
From: Julien Deswaef <juego@requiem4tv.com>
Date: Tue, 11 Oct 2016 10:25:10 -0400
Subject: [PATCH 4/9] Removed tracking of illustration changes. Only the URL
 changes, not the visual content.

---
 nytdiff.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/nytdiff.py b/nytdiff.py
index b76bd128f7..f9849c5765 100644
--- a/nytdiff.py
+++ b/nytdiff.py
@@ -384,7 +384,7 @@ def entry_to_dict(self, article):
         article_dict['title'] = article.title
         article_dict['abstract'] = self.strip_html(article.description)
         article_dict['author'] = article.author
-        article_dict['illustration'] = article.media_content[0]['url']
+        # article_dict['illustration'] = article.media_content[0]['url']
         # article_dict['illustartion_size'] = article.media_content[0]['filesize']
         od = collections.OrderedDict(sorted(article_dict.items()))
         article_dict['hash'] = hashlib.sha224(
@@ -450,11 +450,6 @@ def store_data(self, data):
                             tweet_text = "Modification de l'auteur"
                             self.tweet(tweet_text, data['article_id'], url,
                                        'article_id')
-                    if row['illustration'] != data['illustration']:
-                        if self.show_diff(row['illustration'], data['illustration']):
-                            tweet_text = "Modification de l'illustration"
-                            self.tweet(tweet_text, data['article_id'], url,
-                                       'article_id')
 
     def loop_entries(self, entries):
         if len(entries) == 0:

From 20cc5cd5a8bdf6d76bac7a823f51e8c29b884185 Mon Sep 17 00:00:00 2001
From: Julien Deswaef <juego@requiem4tv.com>
Date: Tue, 11 Oct 2016 11:02:19 -0400
Subject: [PATCH 5/9] Allows URLs to be displayed correctly. Without this they
 will be truncated

---
 css/styles.css | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/css/styles.css b/css/styles.css
index ec24f9e2e6..ed5ec3e8ab 100755
--- a/css/styles.css
+++ b/css/styles.css
@@ -1,12 +1,12 @@
-@font-face { 
-    font-family: Merriweather; 
+@font-face {
+    font-family: Merriweather;
     font-style: normal;
     font-weight: normal;
-    src: url('../fonts/Merriweather-Regular.ttf') format("truetype"); 
-} 
+    src: url('../fonts/Merriweather-Regular.ttf') format("truetype");
+}
 
-body { 
-    background: lightgray url('../img/paper_fibers.png') repeat; 
+body {
+    background: lightgray url('../img/paper_fibers.png') repeat;
     font-family: Merriweather;
     font-size: 16px;
 }
@@ -17,6 +17,7 @@ p {
     margin-top: 1em;
     margin-bottom: 1em;
     font-weight: normal;
+    word-wrap: break-word;
 }
 
 del {

From 7996d8e7f9b674062da3fa8fb8c68ead84f17799 Mon Sep 17 00:00:00 2001
From: Julien Deswaef <juego@requiem4tv.com>
Date: Tue, 11 Oct 2016 11:48:47 -0400
Subject: [PATCH 6/9] changed setting tmp twitter id when TESTING

---
 nytdiff.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/nytdiff.py b/nytdiff.py
index f9849c5765..1062aa1f93 100644
--- a/nytdiff.py
+++ b/nytdiff.py
@@ -146,11 +146,13 @@ def tweet(self, text, article_id, url, column='id'):
         if reply_to is None:
             logging.info('Tweeting url: %s', url)
             tweet = self.tweet_text(url)
-            reply_to = tweet.id if not TESTING else 'test_tweet_id'
+            # if TESTING, give a random id based on time
+            reply_to = tweet.id if not TESTING else time.time()
         logging.info('Replying to: %s', reply_to)
         tweet = self.tweet_with_media(text, images, reply_to)
         if TESTING :
-            tweet_id = 'test_tweet_id'
+            # if TESTING, give a random id based on time
+            tweet_id = time.time()
         else:
             tweet_id = tweet.id
         logging.info('Id to store: %s', tweet_id)

From 97b9f66678898118ff320c6e8421903e58a12b6f Mon Sep 17 00:00:00 2001
From: Julien Deswaef <juego@requiem4tv.com>
Date: Tue, 11 Oct 2016 11:49:18 -0400
Subject: [PATCH 7/9] Added placeholder for RSS link

---
 run_diff.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/run_diff.sh b/run_diff.sh
index 1ab5bf0082..a3fa18aac7 100755
--- a/run_diff.sh
+++ b/run_diff.sh
@@ -7,6 +7,7 @@ export NYT_TWITTER_ACCESS_TOKEN=""
 export NYT_TWITTER_ACCESS_TOKEN_SECRET=""
 
 export NYT_API_KEY=""
+export RSS_URL=""
 
 export PHANTOMJS_PATH="./"
 

From fd16d59d64c4fba58630a26981b68fabd4416a0d Mon Sep 17 00:00:00 2001
From: Julien Deswaef <juego@requiem4tv.com>
Date: Tue, 11 Oct 2016 12:00:01 -0400
Subject: [PATCH 8/9] added explanation and credits

---
 README.md | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index d7b81daa2e..982f494a89 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,21 @@
-# NYTdiff
+# NYTdiff+
 
-Code for the twitter bot [@nyt_diff](https://twitter.com/nyt_diff).
+Based on @j-e-d's code for the twitter bot [@nyt_diff](https://twitter.com/nyt_diff).  
+RSS feed fetching added for @xuv's twitter bot [@lesoir_diff](https://twitter.com/lesoir_diff)
 
-The [phantomjs](http://phantomjs.org/) binary needs to be installed and the path updated in the run_diff.sh file.
+[Twitter keys](https://dev.twitter.com/) are needed.  
+An [NYT API](http://developers.nytimes.com/) key for the "Top Stories V2" service is needed for The New York Times.  
+An RSS URL is needed for [Le Soir](http://lesoir.be) or any other news website.
 
-[Twitter keys](https://dev.twitter.com/) and the [NYT API](http://developers.nytimes.com/) key for the "Top Stories V2" service are needed, values of this keys need to be entered in the run_diff.sh file.
+Installation
+------------
++ The [phantomjs](http://phantomjs.org/) binary needs to be installed and the path updated in the run_diff.sh file.
++ `pip install -r requirements.txt`
 
-Font: [Merriweather](https://fonts.google.com/specimen/Merriweather). Background pattern: [Paper Fibers](http://subtlepatterns.com/paper-fibers/).
 
+Credits
+-------
++ Original script and idea: @j-e-d Juan E.D. http://unahormiga.com/
++ RSS fetching: @xuv Julien Deswaef http://xuv.be
++ Font: [Merriweather](https://fonts.google.com/specimen/Merriweather)
++ Background pattern: [Paper Fibers](http://subtlepatterns.com/paper-fibers/).

From 8569a3bd26b4b176ed43152aa59384273e468bc5 Mon Sep 17 00:00:00 2001
From: Julien Deswaef <juego@requiem4tv.com>
Date: Tue, 11 Oct 2016 17:43:05 -0400
Subject: [PATCH 9/9] Fixed bug on the version number. The increment that had
 slipped out is now back on

---
 nytdiff.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/nytdiff.py b/nytdiff.py
index 1062aa1f93..c6c8b224af 100644
--- a/nytdiff.py
+++ b/nytdiff.py
@@ -429,14 +429,9 @@ def store_data(self, data):
                                        ORDER BY version DESC \
                                        LIMIT 1' % (data['article_id']))
                 for row in result:
-                    data['version'] = row['version']
+                    data['version'] = row['version'] +1
                     self.versions_table.insert(data)
                     url = data['url']
-                    if row['url'] != data['url']:
-                        if self.show_diff(row['url'], data['url']):
-                            tweet_text = "Modification d'URL"
-                            self.tweet(tweet_text, data['article_id'], url,
-                                       'article_id')
                     if row['title'] != data['title']:
                         if self.show_diff(row['title'], data['title']):
                             tweet_text = "Modification du Titre"
@@ -452,6 +447,11 @@ def store_data(self, data):
                             tweet_text = "Modification de l'auteur"
                             self.tweet(tweet_text, data['article_id'], url,
                                        'article_id')
+                    if row['url'] != data['url']:
+                        if self.show_diff(row['url'], data['url']):
+                            tweet_text = "Modification d'URL"
+                            self.tweet(tweet_text, data['article_id'], url,
+                                       'article_id')
 
     def loop_entries(self, entries):
         if len(entries) == 0: