X-Git-Url: http://dolda2000.com/gitweb/?p=utils.git;a=blobdiff_plain;f=ANN.pm;h=6dd02e94fc951f0b1773cf7f5131f6ecf3359386;hp=fbf08e55a88d0d943c556006168fd93890ff8a2e;hb=6602427b42eda9be83a4e74d3548736e062279b2;hpb=23d4abb2e29634e8d2e11879be91cec9cfd6ad13 diff --git a/ANN.pm b/ANN.pm index fbf08e5..6dd02e9 100644 --- a/ANN.pm +++ b/ANN.pm @@ -1,22 +1,46 @@ package Anime::ANN; +use POSIX; +use Digest::MD5; use LWP::UserAgent; use HTML::Entities; $ver = "0.1"; +$ua = LWP::UserAgent->new; +$ua->agent("ANNData/$ver "); +$usecache = 1; sub _get { - my($uri, $ua, $res); + my($uri, $cname, $res); ($uri) = @_; - $ua = LWP::UserAgent->new; - $ua->agent("ANNData/$ver "); - + $cname = $ENV{"HOME"} . "/.ann/"; + mkdir $cname unless -e $cname; + $cname .= "cache/"; + mkdir $cname unless -e $cname; + $cname .= Digest::MD5::md5_hex $uri; + if($usecache && -e $cname) { + my(@s); + @s = stat $cname; + if((time - $s[9]) < 86400) { + my($buf); + open CACHE, "<:utf8", $cname; + $res .= $buf while read CACHE, $buf, 1024; + close CACHE; + return $res; + } + } + $res = $ua->request(HTTP::Request->new("GET", "$uri")); - - die "could not fetch $uri\n" unless $res->is_success; - return $res->content; + + if(open CACHE, ">:utf8", $cname) { + print CACHE $res->decoded_content; + close CACHE; + } + + return undef unless $res->is_success; + return $res->decoded_content; } sub getlist @@ -24,16 +48,24 @@ sub getlist my($name, $il, $html, @ret); ($name) = @_; - $il = uc(($name =~ /^(.)/)[0]); + $name = ($name =~ /^(the\s+)?(.*)$/i)[1]; + $il = uc(($name =~ /^\W*(.)/)[0]); $il = "9" if (!($il =~ /[A-Z]/)); - $html = _get "http://www.animenewsnetwork.com/encyclopedia/anime.php?list=$il"; + if(!($html = _get "http://www.animenewsnetwork.com/encyclopedia/anime.php?list=$il")) { + return undef; + } # The only way to recognize entries that seems sure is to look # after the "HOVERLINE" class. - push @ret, $1 while $html =~ /.*([^<>]*$name[^<>]*)<\/FONT/ig; + while($html =~ /]*>(]*>)?([^<]*<\/small>)?\s*([^<]+)<\//ig) { + if((substr "" . lc $4 , 0, length $name) eq lc $name) { + push @ret, $4; + } + } + # push @ret, $1 while $html =~ /.*([^<>]*$name[^<>]*)<\/FONT/ig; - return(@ret); + return @ret; } sub getid @@ -41,16 +73,31 @@ sub getid my($name, $il, $html, $url); ($name) = @_; - $il = uc(($name =~ /^(.)/)[0]); + $name = ($name =~ /^(the\s+)?(.*)$/i)[1]; + $il = uc(($name =~ /^\W*(.)/)[0]); $il = "9" if (!($il =~ /[A-Z]/)); - $html = _get "http://www.animenewsnetwork.com/encyclopedia/anime.php?list=$il"; + if(!($html = _get "http://www.animenewsnetwork.com/encyclopedia/anime.php?list=$il")) { + return undef; + } # The only way to recognize entries that seems sure is to look # after the "HOVERLINE" class. - (($url) = ($html =~ /]*>(]*>)?([^<]*<\/small>)?\s*([^<]+)<\//ig) { + if((substr "" . lc $4 , 0, length $name) eq lc $name) { + return ($1 =~ /id=(\d+)$/)[0]; + } + } - return ($url =~ /\?id=(\d+)$/)[0]; + return undef; +} + +sub geturl +{ + my($id); + ($id) = @_; + + return "http://www.animenewsnetwork.com/encyclopedia/anime.php?id=$id"; } sub getthemes @@ -58,24 +105,24 @@ sub getthemes my($html, $kind, @ret); ($html, $kind) = @_; - if($html =~ /$kind theme:<\/b>\n/igc) { + if($html =~ /$kind theme:<\/strong>\s*\n/igc) { my(@parts, $ct, $buf); - while($html =~ /\G\    (([^<>]|\|<\/i>)+)/igc) { + while($html =~ /\G\s*\
(([^<>]|\|<\/i>)+)(]*>[^<>]*]*>[^<>]*<\/span>)?<\/div>/igc) { $buf = $1; - # 0 1 2 3 4 5 6 7 8 9 10 11 - if(@parts = ($buf =~ /(\#(\d+):)?\s*\"([^\"\(]+)(\s+\((\(.*)<\/i>(;\s*)?)?([^<>]+)?\))?\"\s+by\s+([^\(]*[^\(\s])(\s*\(eps (\d+)-(\d+)?\))?/i)) { + # 0 1 2 3 4 5 6 7 8 9 10 1112 + if(@parts = ($buf =~ /(\#(\d+):)?\s*\"([^\"\(]+\S)(\s*\((\(.*)<\/i>( - \s*)?)?([^<>]+)?\))?\"\s+by\s+([^\(]*[^\(\s])(\s*\(eps? (\d+)(-(\d+))?\))?/i)) { $ct = {}; $ct->{"num"} = $parts[1] if defined $parts[1]; if(defined $parts[5]) { - $ct->{"tit"} = $parts[5]; + $ct->{"tit"} = decode_entities($parts[5]); $ct->{"jat"} = decode_entities($parts[2]) if defined $parts[2]; } else { - $ct->{"tit"} = $parts[2] if defined $parts[2]; + $ct->{"tit"} = decode_entities($parts[2]) if defined $parts[2]; } - $ct->{"ent"} = $parts[7] if defined $parts[7]; - $ct->{"prf"} = $parts[8] if defined $parts[8]; + $ct->{"ent"} = decode_entities($parts[7]) if defined $parts[7]; + $ct->{"prf"} = decode_entities($parts[8]) if defined $parts[8]; $ct->{"fep"} = $parts[10] if defined $parts[10]; - $ct->{"lep"} = $parts[11] if defined $parts[11]; + $ct->{"lep"} = $parts[12] if defined $parts[12]; push @ret, $ct; } } @@ -89,16 +136,26 @@ sub getseries my($id, $buf, $html, %ret); ($id) = @_; - $html = _get "http://www.animenewsnetwork.com/encyclopedia/anime.php?id=$id"; + if(!($html = _get geturl $id)) { + return undef; + } - $ret{"url"} = "http://www.animenewsnetwork.com/encyclopedia/anime.php?id=$id"; - ($ret{"name"}) = ($html =~ /\Anime News Network - ([^<]*)<\/TITLE>/); - if(($buf) = ($html =~ /vintage:<\/b>\n([^<]+)([^<]*) - Anime News Network<\/title>/); + if($buf =~ /\([^\)]+\)$/) { + ($ret{"name"}, $ret{"type"}) = ($buf =~ /^(.*[^\s])\s*\(([^\)]+)\)$/); + } else { + $ret{"name"} = $buf; + } + if(($buf) = ($html =~ /vintage:<\/strong>\s*\n\s*([^<]+)\n([^<]+)\s*\n\s*([^<]+)\s*\n\s*([^<]+)