From 90e80c0e500c2c2e7420da1a06500fdaca46e24a Mon Sep 17 00:00:00 2001 From: "Eric Richardson (sark)" Date: Tue, 8 Mar 2005 15:03:09 -0800 Subject: [PATCH] Initial import of podcast app. Currently supports downloading audio files to local machine, and (semi-untested) deleting old files to keep a certain number of current files. --- config | 29 ++++++ podcast | 320 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 349 insertions(+) create mode 100644 config create mode 100755 podcast diff --git a/config b/config new file mode 100644 index 0000000..870933c --- /dev/null +++ b/config @@ -0,0 +1,29 @@ + + + /home/eric/projects/podcast/cache + /home/eric/projects/podcast/audio + + http://kcrw.com/podcast/show/lr + 3 + LR: %Y/%m/%d #T# + Left, Right, & Center + + + http://kcrw.com/podcast/show/ww + 3 + WW: %Y/%m/%d #T# + Which Way, L.A.? + + + http://kcrw.com/podcast/show/de + 2 + D&A: %Y/%m/%d #T# + Design & Architecture + + + http://kcrw.com/podcast/show/tp + 3 + TP: %Y/%m/%d #T# + To The Point + + diff --git a/podcast b/podcast new file mode 100755 index 0000000..dab31e6 --- /dev/null +++ b/podcast @@ -0,0 +1,320 @@ +#!/usr/bin/perl + +#use XML::RAI; +use XML::DOM; +use XML::Simple; + +use URI::Fetch; + +use Cache::File; + +use DateTime; +use DateTime::Format::HTTP; + +use MP3::Mplib; + +use Storable; + +use strict; +use vars qw( $cfg $config $args $parser $cache ); + +$cfg = { + config => "/home/eric/projects/podcast/config", +}; + +#---------- + +{ + $parser = new XML::DOM::Parser; +# $parser = new XML::RSS::Parser; + + # -- open our config file -- # + + $config = XML::Simple::XMLin($cfg->{config}); + + # -- open our cache -- # + + $cache = Cache::File->new( cache_root => $config->{cache} ); + + # -- run through our feeds -- # + + while ( my ($n,$f) = each %{ $config->{feed} } ) { + next if ( $f->{disabled} ); + + warn "feed name: $n\n"; + + my $rss = URI::Fetch->fetch( $f->{url} , { + Cache => $cache + } ) or die URI::Fetch->errstr(); + + if ( $rss->status == URI::Fetch::URI_OK() ) { + # go on to main handling + my $items = &parse_feed( $f , $rss ); + &handle_items( $f , $items ); + } elsif ( $rss->status == URI::Fetch::URI_NOT_MODIFIED() ) { + # do nothing + } elsif ( $rss->status == URI::Fetch::URI_MOVED_PERMANENTLY() ) { + # we need to update the config file + warn "\n\nMOVING RSS URL FOR $n\nURI: ".$rss->uri."\n"; + $f->{oldurl} = $f->{url}; + $f->{url} = $rss->uri; + } elsif ( $rss->status == URI::Fetch::URI_GONE() ) { + # we'll disable it in the config file, but leave it in there + $f->{ disabled } = 1; + } else { + die "odd status code: $rss->status\n"; + } + } +} + +#---------- + +sub parse_feed { + my ($f,$rss) = @_; + + my $content = $rss->content; + + my $xml = $parser->parse($rss->content,ProtocolEncoding=>"ISO-8859-1"); + + # -- parse through items in the feed -- # + + my $items = []; + + foreach my $item ( $xml->getElementsByTagName('item') ) { + # -- first we'll make a hash with title/desc/etc -- # + my $info = {}; + foreach my $c ( $item->getChildNodes ) { + next if ($c->getNodeName !~ / + ^( + title + | link + | description + | pubdate + | dc:date + | dcterms:created + ) + /ix); + + my $name = $1; + + my $text = $c->getFirstChild->getData; + + # clean it up a bit + $text =~ s!(?:^\s+|\s+$)!!g; + + $info->{ lc($name) } = $text; + } + + # -- now handle some preferred fields -- # + + foreach my $a ( + ['created','dcterms:created','dc:date','pubdate'] + ) { + my $f = shift @$a; + foreach my $p (@$a) { + next if (!$info->{$p}); + $info->{$f} = $info->{$p}; + last; + } + } + + # -- now handle the enclosure -- # + { + my $elist = $item->getElementsByTagName('enclosure'); + + # this list should only have one member + warn "item has more than one enclosure?\n" + if ($elist->getLength > 1); + + my $enc = $elist->item(0); + + if ($enc) { + my $ei = $info->{enclosure} = {}; + + my $attributes = $enc->getAttributes; + if ($attributes) { + for my $i ( 0 .. $attributes->getLength-1 ) { + my $attr = $attributes->item($i); + $ei->{ lc($attr->getName) } = $attr->getValue; + } + } + } else { + # boo... item has no audio + } + } + + push @$items , $info; + } + + return $items; +} + +#---------- + +sub handle_items { + my ($f,$items) = @_; + + # -- find out what files we already know about -- # + + my $known; + if ( my $blob = $cache->get( "known_files." . $f->{url} ) ) { + $known = Storable::thaw( $blob ); + } else { + $known = []; + } + + # -- get new files -- # + + my $current = []; + + # we assume that the podcast feed is always going to have our newest + # file(s), so we start there with the idea that if we don't reach our + # count we'll keep around a couple we already know + + { + my $c = 0; + foreach my $i (@$items) { +# warn "item date: $i->{created}\n"; + + my $date + = DateTime::Format::HTTP->parse_datetime($i->{created}); + +# print "date: " . $date->strftime("%Y/%m/%d") . "\n"; + + if ($i->{enclosure} && $c < $f->{count}) { + if ( &download_audio($f,$i,$date) ) { + warn "onto current: " . &local_file_from_url($i) . "\n"; + push @$current, [ $date->epoch , &local_file_from_url($i) ]; + $c++; + } else { + # nothing + } + } else { + # no audio or we've reached our limit + } + } + + # map curent files to filenames + my $files = {}; + %$files = map { $_->[1] => 1 } @$current; + + # -- fill in missing current files -- # + + if ($c < $f->{count}) { + foreach my $k (@$known) { + next if ($files->{ $k->[1] }); + warn "adding $k->[1] to current\n"; + push @$current, $k; + $c++; + last unless ($c < $f->{count}); + } + } + + # (refresh) map curent files to filenames + %$files = map { $_->[1] => 1 } @$current; + + # -- delete old known files -- # + + foreach my $k (@$known) { + # skip it if the file name's in current + next if ($files->{ $k->[1] }); + + my $lfile = $config->{local} . "/" . $k->[1]; + # if it's not, out it goes + warn "deleting $lfile\n"; + #`rm $lfile`; + } + } + # -- store a list of what we know -- # + + $cache->set( "known_files." . $f->{url} , Storable::nfreeze( $current ) ); + + return 1; +} + +#---------- + +sub download_audio { + my ($f,$i,$date) = @_; + + # -- split the filename off the url -- # + + my $file = $config->{local} . "/" . &local_file_from_url($i); +# warn "file: $file\n"; + + # -- first make sure we don't already have the file -- # + + return 1 if (-e $file); + + # -- grab the file -- # + + `wget -O $file $i->{enclosure}{url}`; + + # -- change file info if desired -- # + + my $mp3 = MP3::Mplib->new($file); + + my $id3v2 = $mp3->get_v2tag(); + + foreach my $key (['title','TIT2'],['artist','TPE1'],['album','TALB']) { + next if (!$f->{ $key->[0] }); + + my $value = $date->strftime($f->{ $key->[0] }); + + $value =~ s!#T#!$i->{title}!i; + $value =~ s!#N#!$i->{name}!i; + + $id3v2->{ $key->[1] } = $value; + } + + $mp3->set_v2tag($id3v2); + + return 1; +} + +#---------- + +sub local_file_from_url { + my $i = shift; + + my ($file) = $i->{enclosure}{url} =~ m!.*/([^/]+)$!; + + return $file; +} + +#---------- + +sub handle_feed_rai { + my ($f,$rss) = @_; + + # stupid XML::RAI and XML::RSS::Parser don't pass any extra args on + # to XML::Parser or XML::Expat, so we'll have to go around them + + my $xml = + XML::RAI::new( + $parser->parse($rss->content,ProtocolEncoding=>"ISO-8859-1") + ); + +# my $xml = XML::RAI->parse($rss->content,ProtocolEncoding=>"ISO-8859-1"); + + my $items = []; + + foreach my $item ( @{ $xml->items } ) { + my $info = {}; + foreach my $t ('title','link','description','created') { + $info->{ $t } = $item->$t; + } + + warn "item: $info->{title}\n"; + } +} + +package XML::RSS::Parser; + +sub parse { + my $class = shift; + $class->rss_normalize($class->SUPER::parse(@_)); +} + +1; + -- 2.1.4