I only started coding this on Wednesday so it's got a way to go yet:
- it reads a feed (live from the website or from a local file),
- checks the size of each item (newznab:attr 'size'),
- if the title contains particular words then it grabs the item,
- if the item is smaller than 700MB or larger than 15GB then it ignores it,
- creates XML for any items that make it through,
- displays the entire XML tree on screen.
Code: Select all
#!/usr/bin/env python
import re
import sys
import os
from urllib import urlopen
from bs4 import BeautifulSoup
from xml.etree.ElementTree import Element, SubElement, Comment, tostring
from xml.dom import minidom
import ConfigParser
def load_list_from_file(file):
    """Return the non-blank lines of a text file as a list of strings.

    file -- path of the text file to read, one entry per line.

    Leading and trailing whitespace is stripped from every line.  Lines that
    are empty after stripping are skipped: an empty string is a substring of
    every other string, so a stray blank line would otherwise behave as a
    phrase that matches everything in a substring test.
    """
    # 'with' closes the handle even when reading fails part-way through
    with open(file, 'r') as handle:
        stripped = (line.strip() for line in handle)
        return [entry for entry in stripped if entry]
def load_list_from_dir(dir):
    """Return the names of the immediate subdirectories of ``dir``.

    dir -- directory to scan, with or without a trailing path separator;
           an empty string means the current directory.

    Names that start with a parenthesised group, such as '(dump)', are
    dropped, and the marker ' (ignore-audio)' is removed from the names that
    remain.  An empty list is returned when ``dir`` cannot be walked, for
    example because it does not exist.
    """
    # joining with '.' keeps '' -> current directory and also copes with a
    # path that has no trailing separator
    top = os.path.join(dir, '.')
    # the first tuple os.walk yields describes ``top`` itself; the default
    # covers the case where os.walk yields nothing at all
    subdirs = next(os.walk(top), (top, [], []))[1]
    # remove compilation dirs like '(dump)'
    subdirs = [name for name in subdirs if not re.match(r"\(.*?\)", name)]
    # strip the '(ignore-audio)' marker from the directory names
    return [name.replace(' (ignore-audio)', '') for name in subdirs]
def sizeof_fmt(num, suffix='B'):
    """Format ``num`` as a human-readable size using binary (1024) prefixes.

    num    -- the quantity to format (may be negative).
    suffix -- unit text appended after the prefix, 'B' by default.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    'Zi' prefix has been passed, in which case 'Yi' is used.
    """
    # refer - http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    step = 0
    # 'not (... < ...)' rather than '>=' so that NaN keeps being scaled
    while step < len(prefixes) and not abs(num) < 1024.0:
        num /= 1024.0
        step += 1
    if step == len(prefixes):
        return "%.1f %s%s" % (num, 'Yi', suffix)
    return "%3.1f %s%s" % (num, prefixes[step], suffix)
def convert_to_MB(number):
    """Return ``number`` divided by 1024 * 1024, converted with int().

    number -- a size; the caller in this file passes a byte count.
    """
    units_per_mb = 1024 ** 2
    scaled = number / units_per_mb
    return int(scaled)
def strip_desc(text):
    """Return ``text`` with every CDATA-only <description> element removed.

    Each '<description><![CDATA ... ]]></description>' span is deleted as a
    whole: the tags and the CDATA content they enclose.  Descriptions that do
    not open with a CDATA section are left untouched.

    This is a workaround so lxml doesn't choke on CDATA tags.
    """
    # DOTALL lets the non-greedy '.*?' run across line breaks inside the CDATA
    cdata_description = re.compile(
        r"<description><!\[CDATA.*?\]\]></description>", re.DOTALL)
    return cdata_description.sub("", text)
def prettify(elem):
    """Return ``elem`` serialised as indented, UTF-8 encoded XML.

    elem -- an ElementTree element; it and its children are serialised.

    The element is first serialised with ElementTree, then re-parsed with
    minidom so that minidom's pretty-printer (one space per nesting level)
    can lay it out.
    """
    document = minidom.parseString(tostring(elem, 'utf-8'))
    return document.toprettyxml(indent=" ", encoding='utf-8')
def init_xml_tree(title):
    """Create a new RSS skeleton and store it in module-level globals.

    title -- text placed in the channel's <title> element.

    Sets the globals ``rss_root`` (an <rss version="2.0"> element) and
    ``rss_channel`` (its <channel> child), replacing any previous values.
    Always returns 0.
    """
    global rss_root
    global rss_channel
    rss_root = Element('rss', {'version': '2.0'})
    rss_channel = SubElement(rss_root, 'channel')
    SubElement(rss_channel, 'title').text = title
    return 0
def append_to_xml_tree(title, link, category, size):
    """Append one <item> to the module-level ``rss_channel`` element.

    title, link, category, size -- text for the item's child elements of the
    same names, which are created in that order.

    ``rss_channel`` must already exist as a global when this is called.
    Always returns 0.
    """
    # add another 'item' to the main tree
    entry = SubElement(rss_channel, 'item')
    children = (
        ('title', title),
        ('link', link),
        ('category', category),
        ('size', size),
    )
    for tag, value in children:
        SubElement(entry, tag).text = value
    return 0
# Size window: an item is kept only when its size lies strictly between the
# two limits (compared against the value returned by convert_to_MB).
LOWER_SIZE_LIMIT = 700  # (MB) files must be larger than this
UPPER_SIZE_LIMIT = 15000  # (MB) files must be smaller than this

# The configuration file is expected in the same directory as this script.
script_path = os.path.dirname(os.path.realpath(sys.argv[0]))
script_name = os.path.basename(__file__)  # this script's name without path
script_basename = os.path.splitext(script_name)[0]  # this script's name without path or extension
config_file = os.path.join(script_path, "rss-agg.cfg")
config = ConfigParser.ConfigParser()
config.read(config_file)
primary_feed = config.get('feeds', 'primary')

# Phrases to keep come from a text file plus the subdirectory names of a
# directory; phrases to discard come from a text file only.  Every phrase is
# lower-cased and has its spaces replaced by dots before matching
# (presumably because feed titles use dots as separators -- TODO confirm).
included_phrases_file = config.get('paths', 'included_phrases_file')
included_names_path = config.get('paths', 'included_names_path')
included_phrases = [
    phrase.replace(' ', '.').lower()
    for phrase in (load_list_from_file(included_phrases_file)
                   + load_list_from_dir(included_names_path))
]
excluded_phrases_file = config.get('paths', 'excluded_phrases_file')
excluded_phrases = [
    phrase.replace(' ', '.').lower()
    for phrase in load_list_from_file(excluded_phrases_file)
]
print("\n * keeping these phrases: ({} found)\n{}".format(len(included_phrases), included_phrases))
print("\n * discarding these phrases: ({} found)\n{}".format(len(excluded_phrases), excluded_phrases))

print("\n - reading feed...")
# The feed is currently read from a local copy.  To fetch it live instead:
#     handler = urlopen(primary_feed).read()
with open('feedcopy.rss.cfm') as feed_file:  # closed as soon as it is read
    handler = feed_file.read()
handler = strip_desc(handler)
soup = BeautifulSoup(handler, 'lxml')
# NOTE(review): this raises AttributeError when no <description> element is
# left in the feed after strip_desc() -- confirm the feed always has one.
init_xml_tree(soup.description.text)

for item in soup.findAll('item'):
    # build dictionary of newznab attributes; only the text before the first
    # '.' of each name and each value is kept
    newz_dict = {}
    for attribute in item.findAll('newznab:attr'):
        newz_dict[attribute['name'].split(".")[0]] = attribute['value'].split(".")[0]

    # an item whose size is missing or not a whole number cannot be checked
    # against the size window, so skip it instead of aborting the whole run
    try:
        size_in_bytes = int(newz_dict['size'])
    except (KeyError, ValueError):
        print(" ! skipping an item without a usable 'size' attribute")
        continue
    item_size = convert_to_MB(size_in_bytes)

    item_title = item.title.text
    title_lower = item_title.lower()  # lower-case once, not once per phrase
    # grab any items with these phrases
    must_grab_flag = any(phrase in title_lower for phrase in included_phrases)
    # disregard anything with these phrases
    should_grab_flag = not any(phrase in title_lower for phrase in excluded_phrases)
    # An included phrase overrides an excluded one.  An item that matches
    # neither list is kept as well, because should_grab_flag is then True.
    if not (must_grab_flag or should_grab_flag):
        continue
    # disregard if too small or too large
    if item_size <= LOWER_SIZE_LIMIT or item_size >= UPPER_SIZE_LIMIT:
        continue
    append_to_xml_tree(item_title, item.guid.text, item.category.text, sizeof_fmt(size_in_bytes))

print("\n * RSS XML output:\n{}".format(prettify(rss_root)))
and the config file is:
Code: Select all
[paths]
included_names_path = /names/
included_phrases_file = included_phrases.lst
excluded_phrases_file = excluded_phrases.lst
[feeds]
primary = https://website.com/rss.cfm?r=abc123
I'm currently working on loading lists of 'include' and 'exclude' words from files, which are then matched against the incoming feed (the equivalent of a Yahoo Pipes filter module).
I'm fairly new to Python so please excuse any glaring newbie-type issues that are present.
