| 1 |
#!/usr/bin/env python |
| 2 |
# -*- coding: utf-8 -*- |
| 3 |
# vim: set fileencoding=utf-8 tabstop=4 shiftwidth=4 expandtab smartindent: |
| 4 |
u"""
Crée un flux RSS avec les interventions de Jean-Marc Jancovici sur France
Info.

Les fichiers sonores sont placés dans des balises ``<enclosure/>`` pour faciliter le podcasting.

:Authors:
    Aurélien Bompard <aurelien@bompard.org> <http://aurelien.bompard.org>

:License:
    GNU GPL v3 or later
"""
| 16 |
|
| 17 |
import os |
| 18 |
import sys |
| 19 |
import urllib2 |
| 20 |
import urlparse |
| 21 |
import re |
| 22 |
import BeautifulSoup |
| 23 |
from pprint import pprint |
| 24 |
|
| 25 |
list_url = "http://www.france-info.com/rss/Le_regard_de_Jean-Marc_Jancovici.xml"
base_url = "http://www.france-info.com"

# Captures the .mp3 path given as the last argument of the
# ``jstoflash('play', ...)`` JavaScript call stored in an "onclick" attribute.
# Raw string, so the backslashes reach the regex engine untouched.
js_re = re.compile(r"jstoflash\('play','.*','','','([^']*\.mp3)'\);")


def _warn(message):
    """Write a one-line diagnostic to stderr (stdout is reserved for the feed)."""
    sys.stderr.write("%s\n" % message)


def _fetch(url):
    """Return the body served at ``url``, closing the connection afterwards.

    Raises ``urllib2.URLError`` (or a subclass) when the request fails.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def _find_podcast_url(item_page):
    """Return the absolute MP3 URL referenced by ``item_page``, or None.

    The page is expected to contain an ``<a class="p_ficheEcouter">`` whose
    "onclick" attribute matches ``js_re``; None is returned when the anchor,
    the attribute or the match is missing.
    """
    item_content = BeautifulSoup.BeautifulSoup(item_page)
    play_link = item_content.find("a", {"class": "p_ficheEcouter"})
    if play_link is None:
        return None
    js_mo = js_re.search(play_link.get("onclick") or "")
    if js_mo is None:
        return None
    return urlparse.urljoin(base_url, js_mo.group(1))


def _describe_enclosure(enclosure, podcast_url):
    """Set the url, type and length attributes of ``enclosure``.

    Type and length come from the response headers of ``podcast_url``.  When
    the request fails or a header is absent, the type falls back to
    "audio/mpeg" and the length attribute is removed rather than left stale.
    """
    content_type = None
    content_length = None
    try:
        response = urllib2.urlopen(podcast_url)
    except urllib2.URLError as e:
        sys.stderr.write("Error getting podcast info from %s: %s\n" %
                         (podcast_url, e))
    else:
        try:
            podcast_info = response.info()
            content_type = podcast_info.getheader("Content-Type")
            content_length = podcast_info.getheader("Content-Length")
        finally:
            # Only the headers are needed; do not keep the download open.
            response.close()

    enclosure["url"] = podcast_url
    enclosure["type"] = content_type or "audio/mpeg"  # reasonable fallback
    if content_length:
        enclosure["length"] = content_length
    else:
        del enclosure["length"]


list_content = BeautifulSoup.BeautifulStoneSoup(_fetch(list_url))

channel = list_content.channel
if channel is None:
    _warn("No <channel> element found in %s" % list_url)
    sys.exit(1)

# Move the direct children of <atom:link> up into <channel>, then drop the
# emptied <atom:link> element.
# NOTE(review): presumably a workaround for the parser nesting the following
# elements inside the self-closing <atom:link/> -- confirm.
atom_link = list_content.find("atom:link")
if atom_link is not None:
    for nested in atom_link.findAll(recursive=False):
        channel.insert(-1, nested)
    atom_link.extract()

for item in channel.findAll("item"):
    link = item.link.string if item.link is not None else None
    if not link:
        _warn("Skipping an item without a link")
        continue
    if item.enclosure is None:
        _warn("Skipping %s: the item has no enclosure" % link)
        continue
    try:
        item_page = _fetch(link)
    except urllib2.URLError as e:
        # One unreachable page must not prevent the rest of the feed.
        _warn("Error getting item page %s: %s" % (link, e))
        continue
    podcast_url = _find_podcast_url(item_page)
    if podcast_url is None:
        _warn("No podcast link found in %s" % link)
        continue
    _describe_enclosure(item.enclosure, podcast_url)

print(list_content.prettify())