Post RSS to GNU Social

gnusrss.py 20KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. #
  4. # Author:: drymer <drymer [ EN ] autistici.org>
  5. # Copyright:: Copyright (c) 2016, drymer
  6. #
  7. # This program is free software: you can redistribute it and/or modify
  8. # it under the terms of the GNU General Public License as published by
  9. # the Free Software Foundation, either version 2 of the License, or (at
  10. # your option) any later version.
  11. #
  12. # This program is distributed in the hope that it will be useful, but
  13. # WITHOUT ANY WARRANTY; without even the implied warranty of
  14. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. # General Public License for more details.
  16. #
  17. # You should have received a copy of the GNU General Public License
  18. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  19. import argparse
  20. import configparser
  21. import hashlib
  22. import logging
  23. import time
  24. import urllib.parse
  25. import urllib.request
  26. import sqlite3
  27. from html.parser import HTMLParser
  28. from os import listdir, path
  29. from re import findall
  30. from sys import argv, exit
  31. from xml.dom import minidom
  32. import requests
  33. import feedparser
  34. # Logging stuff
  35. logger = logging.getLogger()
  36. handler = logging.StreamHandler()
  37. formatter = logging.Formatter(
  38. '%(asctime)s %(levelname)-8s %(funcName)s %(message)s')
  39. handler.setFormatter(formatter)
  40. logger.addHandler(handler)
  41. class Database:
  42. """Manage the database."""
  43. def __init__(self, database='gnusrss.db'):
  44. """
  45. Connect to the database.
  46. database -- string containig the filepath of the db
  47. """
  48. self.connection = sqlite3.connect(database)
  49. logger.info('Sqlite database connected')
  50. def create_tables(self):
  51. """Create table and columns."""
  52. current = self.connection.cursor()
  53. drop = 'DROP TABLE IF EXISTS items'
  54. create = 'CREATE TABLE items(id INTEGER PRIMARY KEY, feed TEXT, po' + \
  55. 'st TEXT, posted INTEGER, url TEXT, lastbuild TIMESTAMP, ' + \
  56. 'guid TEXT)'
  57. current.execute(drop)
  58. logger.info('Sqlite Query: %s', drop)
  59. current.execute(create)
  60. logger.info('Sqlite Query: %s', create)
  61. def insert_data(self, param):
  62. """
  63. Insert all the article's information to the table.
  64. Keyword arguments:
  65. param -- list containing all the values
  66. """
  67. insert = 'INSERT INTO items(feed, post, posted, url, lastbuild, gu' + \
  68. 'id) VALUES(\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\")' \
  69. % tuple(param)
  70. self.connection.execute(insert)
  71. self.connection.commit()
  72. logger.info('Sqlite Query: %s', insert)
  73. def select(self, param):
  74. """
  75. Return a select.
  76. Keyword arguments:
  77. param -- string containing a sql select
  78. """
  79. current = self.connection.cursor()
  80. current.execute(param)
  81. rows = current.fetchall()
  82. logger.info('Sqlite Query: %s', param)
  83. logger.debug('Sqlite Query Result: %s', rows)
  84. return rows
  85. def close(self):
  86. """Close the database."""
  87. self.connection.close()
  88. logger.info('Sqlite database closed')
  89. class StupidParser(HTMLParser):
  90. """Just a HTML parser."""
  91. def __init__(self):
  92. try:
  93. HTMLParser.__init__(self, convert_charrefs=True)
  94. except:
  95. # python 3.2 support
  96. HTMLParser.__init__(self)
  97. self.data = []
  98. def handle_data(self, data):
  99. self.data.append(data)
  100. def return_value(self):
  101. return ''.join(self.data)
  102. class GNUsrss:
  103. def parse_feed(self, feed, post_format):
  104. """
  105. Request the feed, parse it and return requested values on a list
  106. of lists.
  107. Keyword arguments:
  108. feed -- string containing the url or the filepath of the feed
  109. post_format -- string containing RSS keywords surrounded by {}
  110. Comment:
  111. Here it's saved way more tags that aren't necessary. They're added just
  112. to add more metadata just because it's clearer when viewing the sqlite.
  113. """
  114. article = []
  115. xml = feedparser.parse(feed)
  116. entries_keys = list(xml.entries[0].keys())
  117. feed_keys = list(xml.feed.keys())
  118. # Very ugly way to test existence, but seems to be the only way
  119. if 'published' in entries_keys:
  120. lastbuild = xml.entries[0].published
  121. elif 'published' in feed_keys:
  122. lastbuild = xml.feed.published
  123. elif 'updated' in entries_keys:
  124. lastbuild = xml.entries[0].updated
  125. elif 'updated' in feed_keys:
  126. lastbuild = xml.feed.updated
  127. else:
  128. # Since the feed doesn't have a date, I'll create it
  129. lastbuild = time.strftime("%a, %d %b %Y %H:%M:%S GMT")
  130. if 'link' in feed_keys:
  131. rss_link = xml.feed.link
  132. else:
  133. rss_link = 'http://' + xml.entries[0].link.split('/')[2]
  134. for item in xml['items']:
  135. values = {}
  136. for i in entries_keys:
  137. if i in post_format:
  138. values[i] = item[i]
  139. post = post_format.format(**values)
  140. # Stupid HTML code adding to complete the post to parse it
  141. post = '<html>' + post + '</html>'
  142. parser = StupidParser()
  143. parser.feed(post)
  144. post = parser.return_value()
  145. if 'guid' in entries_keys:
  146. guid = item['guid']
  147. else:
  148. # Since the feed doesn't have a guid, I'll create it
  149. guid = hashlib.sha1(post.encode()).hexdigest()
  150. article.append([rss_link, post, item['link'], lastbuild, guid])
  151. return article
  152. def post(self, article, gs_node, username, password, insecure):
  153. """
  154. Post the articles to GNU Social.
  155. Keyword arguments:
  156. article -- list containing a most of what is necessary on the insert
  157. gs_node -- string containing the url of the GNU Social node
  158. username -- string containing the user of GNU Social
  159. password -- string containing the password of GNU Social
  160. """
  161. msg = article[1].split()
  162. api = (gs_node + '/api/statuses/update.xml')
  163. # Check for twitter images and call post_image if required
  164. for word in msg:
  165. if 'pic.twitter.com/' in word:
  166. image = self.post_image(word, gs_node, username, password,
  167. insecure)
  168. if image is not None:
  169. index = msg.index(word)
  170. msg[index] = image
  171. else:
  172. pass
  173. msg = ' '.join(msg)
  174. post_data = {'status': msg, 'source': 'gnusrss'}
  175. if insecure == 'yes':
  176. req = requests.post(api, auth=(username, password), data=post_data,
  177. verify=False)
  178. else:
  179. req = requests.post(api, auth=(username, password), data=post_data)
  180. logger.info('%s %s %s', req.url, req.status_code, post_data)
  181. logger.debug('%s', req.text)
  182. response = req.status_code
  183. return response
  184. def post_image(self, picture, gs_node, username, password, insecure):
  185. """
  186. Upload a picture to GNU Social hosting and return a string with the
  187. new url.
  188. Keyword arguments:
  189. picture -- string containing the twitter url of a picture
  190. gs_node -- string containing the url of the GNU Social node
  191. username -- string containing the user of GNU Social
  192. password -- string containing the password of GNU Social
  193. """
  194. pic = ""
  195. found = False
  196. api = gs_node + '/api/statusnet/media/upload'
  197. # If the picture doesn't exist or is not well written, show must go on
  198. try:
  199. html = urllib.request.urlopen('https://' + picture).read().decode(
  200. 'utf-8').splitlines()
  201. except:
  202. return picture
  203. # For debugging purposes
  204. all_parts = []
  205. for part in html:
  206. all_parts.append(part)
  207. logger.debug('Response: %s', all_parts)
  208. # Search the hardcoded tag name of the picture
  209. for part in html:
  210. if picture in part:
  211. found = True
  212. if 'data-image-url' in part and found is True:
  213. pic = part.split('"')[1]
  214. break
  215. # If there's a video instead of a picture, just exit
  216. if not pic:
  217. return None
  218. req = requests.get(pic)
  219. logger.debug('Response: %s', req.text)
  220. pic = req.content
  221. img = {'media': ('useless.jpg', pic)}
  222. if insecure == 'yes':
  223. req = requests.post(api, auth=(username, password), verify=False,
  224. files=img)
  225. else:
  226. req = requests.post(api, auth=(username, password), files=img)
  227. logger.debug('Response: %s', req.text)
  228. buffer = req.content
  229. xmldoc = minidom.parseString(buffer)
  230. item = xmldoc.getElementsByTagName('rsp')
  231. url = item.item(0).getElementsByTagName('mediaurl')[0].firstChild.data
  232. return url
  233. def compare(self, feeds):
  234. """
  235. Compare the picked feed to the saved on the database and return
  236. list of lists if new.
  237. Keyword argument:
  238. feeds -- list of lists containing all actual feeds on the RSS file
  239. """
  240. db = Database()
  241. old = db.select('select guid from items;')
  242. new_feed = []
  243. posted = []
  244. # make the list accesible
  245. for x in old:
  246. posted.append(x[0])
  247. for feed in feeds:
  248. if feed[4] not in posted:
  249. new_feed.append(feed)
  250. db.close()
  251. return new_feed
  252. def shortener(self, post):
  253. """
  254. Return a shortened url.
  255. Keyword argument:
  256. post -- string containing a url to be shortened
  257. """
  258. api = ('http://qttr.at/yourls-api.php?format=xml&action=shorturl'
  259. '&signature=b6afeec983&url=' + post)
  260. req = requests.post(api)
  261. logger.debug('Response: %s', req.text)
  262. buffer = req.content
  263. xmldoc = minidom.parseString(buffer)
  264. item = xmldoc.getElementsByTagName('result')
  265. url = item.item(0).getElementsByTagName('shorturl')[0].firstChild.data
  266. return url
  267. def shorten_all(self, post):
  268. """
  269. Short all the urls from a notice.
  270. Keyword arguments:
  271. post - list containing all the data related to the post to GS
  272. """
  273. # Regex taken from stackoverflow, thanks guys
  274. # It doesn't identify pic.twitter.com url, which is good
  275. urls = findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&~#=+]|[!*\(\),]'
  276. '|(?:%[0-9a-fA-F][0-9a-fA-F]))+', post[1])
  277. separate = post[1].split(' ')
  278. # Clean shitty carriage return
  279. tmp = []
  280. for i in separate:
  281. i = i.replace('\n', ' ')
  282. tmp.append(i)
  283. separate = tmp
  284. for i in urls:
  285. shortened = self.shortener(i)
  286. position = separate.index(i)
  287. separate[position] = shortened
  288. post[1] = ' '.join(separate)
  289. return post
  290. class Config:
  291. def create(self, config_name):
  292. """
  293. Create config file.
  294. Keyword argument:
  295. config_name -- string containing the config's name to be created
  296. """
  297. print('Hi! Now we\'ll create de config file!')
  298. feed = input('Please introduce the feed\'s url: ')
  299. username = input('Please introduce your username (user@server.com): ')
  300. password = input('Please introduce your password: ')
  301. shorten = input('Do you need to shorten the urls that you post? Please'
  302. ' take in account \nthat you should only use it if you'
  303. 'r node only has 140 characters. \nAnswer with "yes" o'
  304. 'r just press enter if you don\'t want to use it: ')
  305. fallback_feed = input('Please introduce your feed\'s fallback url. If '
  306. 'you don\'t want or have one,\njust press enter'
  307. ': ')
  308. print('Now we\'re going to fetch the feed. Please wait...')
  309. feed_file = feedparser.parse(feed)
  310. keys = list(feed_file.entries[0].keys())
  311. print('Done! The tags are: ')
  312. for tag in keys:
  313. print('\t' + tag)
  314. post_format = input('The XML has been parsed. Choose wich format you w'
  315. 'ant:\nPlease put the tags inside the square brack'
  316. 'ets\nEx: {title} - {link} by @{author}: ')
  317. insecure = input('Do you want to allow insecure connection to your GNU'
  318. ' social server?\nAnswer with "yes" or just press ent'
  319. 'er if you don\'t want to use it: ')
  320. config = configparser.ConfigParser()
  321. config['feeds'] = {}
  322. config['feeds']['feed'] = feed
  323. config['feeds']['user'] = username
  324. config['feeds']['password'] = password
  325. config['feeds']['shorten'] = shorten
  326. config['feeds']['fallback_feed'] = fallback_feed
  327. config['feeds']['format'] = post_format
  328. config['feeds']['insecure'] = insecure
  329. with open(config_name + '.ini', 'w') as configfile:
  330. config.write(configfile)
  331. def get(self, name):
  332. """
  333. Parse config file and return it on a list.
  334. Keyword arguments:
  335. name -- string containing the config's name
  336. """
  337. config = []
  338. parser = configparser.SafeConfigParser()
  339. parser.read(name)
  340. for name, value in parser.items('feeds'):
  341. config.append(value)
  342. return config
  343. class ParseOptions():
  344. """Parse command line options of this program."""
  345. def __init__(self):
  346. parser = argparse.ArgumentParser(description='Post feeds to GNU Social'
  347. '', prog='gnusrss')
  348. parser.add_argument('-c', '--create-config', metavar='file_name',
  349. dest='create_config', help='creates a config file')
  350. parser.add_argument('-C', '--create-db', dest='create_database',
  351. action='store_true', help='creates the database')
  352. parser.add_argument('-p', '--post', metavar='config_file', dest='post',
  353. help='posts feeds')
  354. parser.add_argument('-P', '--post-all', dest='post_all', action='store'
  355. '_true', help='posts all feeds')
  356. parser.add_argument('-k', '--populate-database', metavar='file_name',
  357. dest='populate_database', help='fetch the RSS and'
  358. ' save it in the database')
  359. parser.add_argument('-v', '--version', dest='version', action='store_t'
  360. 'rue', help='show version in the '
  361. 'database')
  362. parser.add_argument('-V', '--verbose', dest='verbose',
  363. metavar='level', help='be more verbose, choose bet'
  364. 'ween "info" or "debug"')
  365. self.db = Database()
  366. self.gs = GNUsrss()
  367. self.cnf = Config()
  368. self.args = parser.parse_args()
  369. # Make all options accesible within self
  370. self.create_database = self.args.create_database
  371. self.create_config = self.args.create_config
  372. self.post = self.args.post
  373. self.post_all = self.args.post_all
  374. self.populate_database = self.args.populate_database
  375. self.version = self.args.version
  376. self.verbose = self.args.verbose
  377. self.parser = parser
  378. def declare_config(self):
  379. """Assign all config parameters to a self object."""
  380. config = self.cnf.get(self.config_name)
  381. self.feed = config[0]
  382. self.user = config[1].split('@')[0]
  383. self.password = config[2]
  384. self.shorten = config[3]
  385. self.fallback_feed = config[4]
  386. self.format = config[5]
  387. # Always use SSL
  388. self.server = 'https://' + config[1].split('@')[1]
  389. # Test since in versions previous to 0.2.2 didn't exist
  390. try:
  391. self.insecure = config[6]
  392. except:
  393. self.insecure = ''
  394. def post_notice(self):
  395. """Post notice to GNU social."""
  396. file_name = self.config_name
  397. # If first feed and fallback feed aren't available, fail gracefully
  398. try:
  399. posts = self.gs.parse_feed(self.feed, self.format)
  400. except Exception as e:
  401. print(e)
  402. if self.fallback_feed:
  403. posts = self.gs.parse_feed(self.fallback_feed, self.format)
  404. else:
  405. print('There\'s been a problem with ' + file_name + ' file.')
  406. return None
  407. posts = list(reversed(posts))
  408. new = self.gs.compare(posts)
  409. if new:
  410. # Post only the older item
  411. self.to_post = new[0]
  412. if self.shorten == 'yes':
  413. self.to_post = self.gs.shorten_all(self.to_post)
  414. if not self.populate_database:
  415. code = self.gs.post(self.to_post, self.server, self.user,
  416. self.password, self.insecure)
  417. self.save_in_database(code)
  418. def save_in_database(self, code):
  419. """
  420. Save posts in database
  421. Keyword arguments:
  422. code -- HTML code of the notice's post to GNU social
  423. """
  424. if self.create_config or self.populate_database or int(code) == \
  425. int(200):
  426. self.db.insert_data([self.to_post[0], self.to_post[1], 1,
  427. self.to_post[2], self.to_post[3],
  428. self.to_post[4]])
  429. elif code != 200:
  430. print('The notice couldn\'t be posted')
  431. def pointers(self):
  432. """This are the options of the program."""
  433. if self.version:
  434. print("v0.2.3.1")
  435. exit()
  436. if self.verbose:
  437. if self.verbose == 2 or self.verbose == 'info':
  438. logger.setLevel(logging.INFO)
  439. elif self.verbose == 1 or self.verbose == 'debug':
  440. logger.setLevel(logging.DEBUG)
  441. if self.create_database:
  442. if path.exists('gnusrss.db'):
  443. overwrite = input('The database already exists. Are you '
  444. 'sure you want to overwrite it? (y/n) ')
  445. if overwrite == 'y':
  446. self.db.create_tables()
  447. else:
  448. self.db.create_tables()
  449. if not self.create_config and not self.populate_database and \
  450. not self.post and not self.post_all:
  451. self.db.close()
  452. if self.create_config:
  453. self.config_name = self.create_config + '.ini'
  454. self.cnf.create(self.create_config)
  455. populate = input('Do you want to populate the database? (y) Or you'
  456. ' prefer to post old items? (n) ')
  457. if populate == 'y':
  458. self.declare_config()
  459. posts = self.gs.parse_feed(self.feed, self.format)
  460. for post in posts:
  461. self.to_post = post
  462. self.save_in_database(0)
  463. self.db.close()
  464. elif self.post:
  465. self.config_name = self.post
  466. self.declare_config()
  467. self.post_notice()
  468. self.db.close()
  469. elif self.post_all:
  470. for config in listdir('.'):
  471. if config.endswith('.ini'):
  472. self.config_name = config
  473. self.declare_config()
  474. self.post_notice()
  475. self.db.close()
  476. elif self.populate_database:
  477. self.config_name = self.populate_database
  478. self.declare_config()
  479. posts = self.gs.parse_feed(self.feed, self.format)
  480. for post in posts:
  481. self.to_post = post
  482. self.save_in_database(0)
  483. self.db.close()
  484. elif len(argv) == 1:
  485. self.parser.print_help()
  486. if __name__ == "__main__":
  487. options = ParseOptions()
  488. options.pointers()