Decode HTML entities correctly.
[automanga.git] / manga / batoto.py
1 import urllib, re, BeautifulSoup
2 import lib, htcache
# Shorthand for the BeautifulSoup parser class.
soup = BeautifulSoup.BeautifulSoup

def soupify(cont):
    """Parse CONT into a soup tree, decoding HTML entities to unicode."""
    return soup(cont, convertEntities=soup.HTML_ENTITIES)
5
def byclass(el, name, cl):
    """Return the first NAME tag under EL whose class attribute contains CL.

    Returns None when no matching tag is found.
    """
    candidates = (ch for ch in el.findAll(name)
                  if isinstance(ch, BeautifulSoup.Tag)
                  and cl in ch.get("class", "").split())
    for match in candidates:
        return match
    return None
13
def nextel(el):
    """Skip any non-tag siblings (text nodes etc.) and return the next Tag after EL."""
    cur = el.nextSibling
    while not isinstance(cur, BeautifulSoup.Tag):
        cur = cur.nextSibling
    return cur
19
class page(lib.page):
    """A single image page within a Batoto chapter."""

    def __init__(self, chapter, stack, n, url):
        self.stack = stack
        self.chapter = chapter
        self.n = n
        self.id = str(n)
        self.name = u"Page %s" % n
        self.url = url
        # Cached image URL; resolved lazily by iurl().
        self.ciurl = None

    def iurl(self):
        """Return the URL of this page's image, fetching the reader page on first use."""
        if self.ciurl is None:
            page = soupify(htcache.fetch(self.url))
            img = nextel(page.find("div", id="full_image")).img
            self.ciurl = img["src"].encode("us-ascii")
        return self.ciurl

    def open(self):
        """Open a standard image stream for this page's image."""
        return lib.stdimgstream(self.iurl())

    def __str__(self):
        return self.name

    # Bug fix: this was misspelled `__repr`, so Python never invoked it
    # as the repr hook; renamed to the proper `__repr__`.
    def __repr__(self):
        return "<batoto.page %r.%r.%r>" % (self.chapter.manga.name, self.chapter.name, self.name)
45
class chapter(lib.pagelist):
    """One chapter of a Batoto manga; its page list is scraped lazily."""

    def __init__(self, manga, stack, id, name, url):
        self.stack = stack
        self.manga = manga
        self.id = id
        self.name = name
        self.url = url
        # Cached page list, populated by pages().
        self.cpag = None

    def __getitem__(self, i):
        return self.pages()[i]

    def __len__(self):
        return len(self.pages())

    # Matches the "page N" labels in the reader's page-select dropdown.
    pnre = re.compile(r"page (\d+)")

    def pages(self):
        """Fetch, cache and return the list of page objects for this chapter."""
        if self.cpag is None:
            doc = soupify(htcache.fetch(self.url))
            opts = doc.find("select", id="page_select").findAll("option")
            pags = []
            for idx, opt in enumerate(opts):
                purl = opt["value"].encode("us-ascii")
                pno = int(self.pnre.match(opt.string).group(1))
                pags.append(page(self, self.stack + [(self, idx)], pno, purl))
            self.cpag = pags
        return self.cpag

    def __str__(self):
        return self.name

    def __repr__(self):
        return "<batoto.chapter %r.%r>" % (self.manga.name, self.name)
78
class manga(lib.manga):
    # A single comic on Batoto.  The chapter list and the alternate-name
    # list are both scraped lazily from the comic's overview page.
    def __init__(self, lib, id, name, url):
        # NOTE: the `lib` parameter (a library instance) shadows the
        # imported `lib` module inside this method.
        self.lib = lib
        self.id = id
        self.name = name
        self.url = url
        self.cch = None         # cached chapter list (see ch())
        self.stack = []
        self.cnames = None      # cached alternate names (see altnames())

    def __getitem__(self, i):
        return self.ch()[i]

    def __len__(self):
        return len(self.ch())

    # Extracts the numeric chapter ID from a reader URL.
    cure = re.compile(r"/read/_/(\d+)/[^/]*")
    def ch(self):
        """Fetch, cache and return this manga's chapters, oldest first."""
        if self.cch is None:
            page = soupify(htcache.fetch(self.url))
            cls = byclass(page, u"table", u"chapters_list")
            if cls.tbody is not None:
                cls = cls.tbody
            # Only chapter rows tagged with the library's language are kept.
            scl = u"lang_" + self.lib.lang
            cch = []
            for ch in cls.childGenerator():
                if isinstance(ch, BeautifulSoup.Tag) and ch.name == u"tr":
                    cll = ch.get("class", "").split()
                    if u"row" in cll and scl in cll:
                        url = ch.td.a["href"].encode("us-ascii")
                        m = self.cure.search(url)
                        if m is None: raise Exception("Got weird chapter URL: %r" % url)
                        cid = m.group(1)
                        # Rebuild a canonical reader URL from the extracted ID.
                        url = self.lib.base + "read/_/" + cid
                        name = ch.td.a.text
                        cch.append((cid, name, url))
            # The site lists chapters newest-first; reverse into
            # chronological order before numbering them.
            cch.reverse()
            rch = []
            for n, (cid, name, url) in enumerate(cch):
                rch.append(chapter(self, [(self, n)], cid, name, url))
            self.cch = rch
        return self.cch

    def altnames(self):
        """Return the list of alternate titles scraped from the "Alt Names:" info row."""
        if self.cnames is None:
            page = soupify(htcache.fetch(self.url))
            cnames = None
            for tbl in page.findAll("table", attrs={"class": "ipb_table"}):
                if tbl.tbody is not None: tbl = tbl.tbody
                for tr in tbl.findAll("tr"):
                    if u"Alt Names:" in tr.td.text:
                        nls = nextel(tr.td)
                        if nls.name != u"td" or nls.span is None:
                            raise Exception("Weird altnames table in " + self.id)
                        # Each alternate name sits in its own <span>.
                        cnames = [nm.text.strip() for nm in nls.findAll("span")]
                        break
                if cnames is not None:
                    break
            if cnames is None:
                raise Exception("Could not find altnames for " + self.id)
            self.cnames = cnames
        return self.cnames

    def __str__(self):
        return self.name

    def __repr__(self):
        return "<batoto.manga %r>" % self.name
147
class library(lib.library):
    # Entry point for the Batoto site: lookup by ID, substring search,
    # and prefix matching by name (including alternate titles).
    def __init__(self):
        self.base = "http://www.batoto.net/"
        # Chapter rows are filtered to this language in manga.ch().
        self.lang = u"English"

    def byid(self, id):
        """Return the manga with the given site ID; raise KeyError if the page has no title."""
        url = self.base + "comic/_/comics/" + id
        page = soupify(htcache.fetch(url))
        title = page.find("h1", attrs={"class": "ipsType_pagetitle"})
        if title is None:
            raise KeyError(id)
        return manga(self, id, title.string.strip(), url)

    def _search(self, pars):
        # Generator over all search results for the query parameters PARS,
        # following paginated result pages until no "show more" row appears.
        p = 1
        while True:
            _pars = dict(pars)
            _pars["p"] = str(p)
            resp = urllib.urlopen(self.base + "search?" + urllib.urlencode(_pars))
            try:
                page = soupify(resp.read())
            finally:
                resp.close()
            rls = page.find("div", id="comic_search_results").table
            if rls.tbody is not None:
                rls = rls.tbody
            hasmore = False
            for child in rls.findAll("tr"):
                # Skip the header row.
                if child.th is not None: continue
                # Skip secondary rows (id prefix "comic_rowo_").
                if child.get("id", u"")[:11] == u"comic_rowo_": continue
                if child.get("id") == u"show_more_row":
                    hasmore = True
                    continue
                link = child.td.strong.a
                url = link["href"].encode("us-ascii")
                m = self.rure.search(url)
                if m is None: raise Exception("Got weird manga URL: %r" % url)
                id = m.group(1)
                name = link.text.strip()
                yield manga(self, id, name, url)
            p += 1
            if not hasmore:
                break

    # Extracts the manga ID from a comic URL.
    rure = re.compile(r"/comic/_/([^/]*)$")
    def search(self, expr):
        """Search for manga whose name contains EXPR (condition "c" = contains)."""
        if not isinstance(expr, unicode):
            expr = expr.decode("utf8")
        return self._search({"name": expr.encode("utf8"), "name_cond": "c"})

    def byname(self, prefix):
        """Yield manga whose name, or one of whose alternate names, starts with PREFIX (case-insensitive)."""
        if not isinstance(prefix, unicode):
            prefix = prefix.decode("utf8")
        # Condition "s" = starts-with on the server side; re-check locally
        # because the engine also matches alternate names.
        for res in self._search({"name": prefix.encode("utf8"), "name_cond": "s"}):
            if res.name[:len(prefix)].lower() == prefix.lower():
                yield res
            else:
                # Matched via an alternate name: report the manga under
                # that matching alternate name instead of its primary one.
                for aname in res.altnames():
                    if aname[:len(prefix)].lower() == prefix.lower():
                        yield manga(self, res.id, aname, res.url)
                        break
                else:
                    # Disabled debugging output for results that matched
                    # neither the primary nor any alternate name.
                    if False:
                        print "eliding " + res.name
                        print res.altnames()
212                         print res.altnames()