X-Git-Url: http://dolda2000.com/gitweb/?p=utils.git;a=blobdiff_plain;f=ANN.pm;h=6dd02e94fc951f0b1773cf7f5131f6ecf3359386;hp=ce85378553e89a9361808570c757c5345ec91ca6;hb=HEAD;hpb=b41d8fa9e6e09d448819e495c22adf23c4b02a10 diff --git a/ANN.pm b/ANN.pm index ce85378..6dd02e9 100644 --- a/ANN.pm +++ b/ANN.pm @@ -35,12 +35,12 @@ sub _get $res = $ua->request(HTTP::Request->new("GET", "$uri")); if(open CACHE, ">:utf8", $cname) { - print CACHE $res->content; + print CACHE $res->decoded_content; close CACHE; } return undef unless $res->is_success; - return $res->content; + return $res->decoded_content; } sub getlist @@ -48,7 +48,8 @@ sub getlist my($name, $il, $html, @ret); ($name) = @_; - $il = uc(($name =~ /^(.)/)[0]); + $name = ($name =~ /^(the\s+)?(.*)$/i)[1]; + $il = uc(($name =~ /^\W*(.)/)[0]); $il = "9" if (!($il =~ /[A-Z]/)); if(!($html = _get "http://www.animenewsnetwork.com/encyclopedia/anime.php?list=$il")) { return undef; @@ -57,7 +58,12 @@ sub getlist # The only way to recognize entries that seems sure is to look # after the "HOVERLINE" class. - push @ret, $1 while $html =~ /.*([^<>]*$name[^<>]*)<\/FONT/ig; + while($html =~ /]*>(]*>)?([^<]*<\/small>)?\s*([^<]+)<\//ig) { + if((substr "" . lc $4 , 0, length $name) eq lc $name) { + push @ret, $4; + } + } + # push @ret, $1 while $html =~ /.*([^<>]*$name[^<>]*)<\/FONT/ig; return @ret; } @@ -67,7 +73,8 @@ sub getid my($name, $il, $html, $url); ($name) = @_; - $il = uc(($name =~ /^(.)/)[0]); + $name = ($name =~ /^(the\s+)?(.*)$/i)[1]; + $il = uc(($name =~ /^\W*(.)/)[0]); $il = "9" if (!($il =~ /[A-Z]/)); if(!($html = _get "http://www.animenewsnetwork.com/encyclopedia/anime.php?list=$il")) { return undef; @@ -76,9 +83,13 @@ sub getid # The only way to recognize entries that seems sure is to look # after the "HOVERLINE" class. - (($url) = ($html =~ /]*>(]*>)?([^<]*<\/small>)?\s*([^<]+)<\//ig) { + if((substr "" . lc $4 , 0, length $name) eq lc $name) { + return ($1 =~ /id=(\d+)$/)[0]; + } + } - return ($url =~ /\?id=(\d+)$/)[0]; + return undef; } sub geturl @@ -94,12 +105,12 @@ sub getthemes my($html, $kind, @ret); ($html, $kind) = @_; - if($html =~ /$kind theme:<\/b>\n/igc) { + if($html =~ /$kind theme:<\/strong>\s*\n/igc) { my(@parts, $ct, $buf); - while($html =~ /\G\    (([^<>]|\|<\/i>)+)/igc) { + while($html =~ /\G\s*\
(([^<>]|\|<\/i>)+)(]*>[^<>]*]*>[^<>]*<\/span>)?<\/div>/igc) { $buf = $1; - # 0 1 2 3 4 5 6 7 8 9 10 11 - if(@parts = ($buf =~ /(\#(\d+):)?\s*\"([^\"\(]+)(\s+\((\(.*)<\/i>(;\s*)?)?([^<>]+)?\))?\"\s+by\s+([^\(]*[^\(\s])(\s*\(eps (\d+)-(\d+)?\))?/i)) { + # 0 1 2 3 4 5 6 7 8 9 10 1112 + if(@parts = ($buf =~ /(\#(\d+):)?\s*\"([^\"\(]+\S)(\s*\((\(.*)<\/i>( - \s*)?)?([^<>]+)?\))?\"\s+by\s+([^\(]*[^\(\s])(\s*\(eps? (\d+)(-(\d+))?\))?/i)) { $ct = {}; $ct->{"num"} = $parts[1] if defined $parts[1]; if(defined $parts[5]) { @@ -111,7 +122,7 @@ sub getthemes $ct->{"ent"} = decode_entities($parts[7]) if defined $parts[7]; $ct->{"prf"} = decode_entities($parts[8]) if defined $parts[8]; $ct->{"fep"} = $parts[10] if defined $parts[10]; - $ct->{"lep"} = $parts[11] if defined $parts[11]; + $ct->{"lep"} = $parts[12] if defined $parts[12]; push @ret, $ct; } } @@ -130,19 +141,19 @@ sub getseries } $ret{"url"} = geturl $id; - ($buf) = ($html =~ /\Anime News Network - ([^<]*)<\/TITLE>/); + ($buf) = ($html =~ /\([^<]*) - Anime News Network<\/title>/); if($buf =~ /\([^\)]+\)$/) { ($ret{"name"}, $ret{"type"}) = ($buf =~ /^(.*[^\s])\s*\(([^\)]+)\)$/); } else { $ret{"name"} = $buf; } - if(($buf) = ($html =~ /vintage:<\/b>\n([^<]+)\s*\n\s*([^<]+)\n([^<]+)\s*\n\s*([^<]+)\n([^<]+)\s*\n\s*([^<]+)