Decode HTML entities correctly.
[automanga.git] / manga / batoto.py
CommitLineData
08e259d7
FT
1import urllib, re, BeautifulSoup
2import lib, htcache
soup = BeautifulSoup.BeautifulSoup

def soupify(cont):
    """Parse cont into a BeautifulSoup tree, decoding HTML entities.

    convertEntities makes BeautifulSoup 3 translate entities such as
    &amp; into their unicode characters while building the tree.
    """
    # PEP 8 (E731): a def, not a lambda bound to a name, so tracebacks
    # and repr() show a useful function name.
    return soup(cont, convertEntities=soup.HTML_ENTITIES)
08e259d7
FT
5
def byclass(el, name, cl):
    """Return the first <name> descendant of el that carries the CSS
    class cl, or None if there is no such tag."""
    for cand in el.findAll(name):
        if not isinstance(cand, BeautifulSoup.Tag):
            continue
        # BS3 returns the class attribute as a single space-separated
        # string; split it to test for exact class membership.
        if cl in cand.get("class", "").split():
            return cand
    return None
13
def nextel(el):
    """Return the next sibling of el that is a Tag, skipping over
    intervening text nodes and other non-tag siblings."""
    nxt = el.nextSibling
    while not isinstance(nxt, BeautifulSoup.Tag):
        nxt = nxt.nextSibling
    return nxt
19
class page(lib.page):
    """A single page of a Batoto chapter.

    The page document is fetched lazily; the resolved image URL is
    cached in self.ciurl after the first lookup.
    """

    def __init__(self, chapter, stack, n, url):
        self.stack = stack
        self.chapter = chapter
        self.n = n
        self.id = str(n)
        self.name = u"Page %s" % n
        self.url = url
        # Cached image URL; filled in by iurl() on first use.
        self.ciurl = None

    def iurl(self):
        """Return the URL of this page's image, fetching and parsing the
        page document on first call."""
        if self.ciurl is None:
            page = soupify(htcache.fetch(self.url))
            # The image lives in the first tag element following the
            # "full_image" div.
            img = nextel(page.find("div", id="full_image")).img
            self.ciurl = img["src"].encode("us-ascii")
        return self.ciurl

    def open(self):
        return lib.stdimgstream(self.iurl())

    def __str__(self):
        return self.name

    def __repr__(self):
        # Bug fix: this was previously misspelled "__repr" (missing the
        # trailing underscores), so repr() never used it.
        return "<batoto.page %r.%r.%r>" % (self.chapter.manga.name, self.chapter.name, self.name)
45
class chapter(lib.pagelist):
    """One chapter of a manga; behaves as a sequence of page objects."""

    def __init__(self, manga, stack, id, name, url):
        self.stack = stack
        self.manga = manga
        self.id = id
        self.name = name
        self.url = url
        # Page-list cache, built on demand by pages().
        self.cpag = None

    def __getitem__(self, i):
        return self.pages()[i]

    def __len__(self):
        return len(self.pages())

    # Extracts the page number from option text such as "page 3".
    pnre = re.compile(r"page (\d+)")

    def pages(self):
        """Fetch, cache and return the list of page objects for this
        chapter, in reading order."""
        if self.cpag is None:
            doc = soupify(htcache.fetch(self.url))
            pag = []
            # Each <option> of the page selector names one page and
            # carries its URL in the value attribute.
            for opt in doc.find("select", id="page_select").findAll("option"):
                purl = opt["value"].encode("us-ascii")
                pno = int(self.pnre.match(opt.string).group(1))
                pag.append(page(self, self.stack + [(self, len(pag))], pno, purl))
            self.cpag = pag
        return self.cpag

    def __str__(self):
        return self.name

    def __repr__(self):
        return "<batoto.chapter %r.%r>" % (self.manga.name, self.name)
78
class manga(lib.manga):
    """A manga series on Batoto.

    Chapter list and alternate names are scraped lazily from the
    series page and cached on the instance.
    """

    def __init__(self, lib, id, name, url):
        # NOTE: the `lib` parameter is the library instance and shadows
        # the module-level `lib` import inside this method only.
        self.lib = lib
        self.id = id
        self.name = name
        self.url = url
        # Chapter cache; populated on first call to ch().
        self.cch = None
        self.stack = []
        # Alternate-name cache; populated on first call to altnames().
        self.cnames = None

    def __getitem__(self, i):
        return self.ch()[i]

    def __len__(self):
        return len(self.ch())

    # Extracts the numeric chapter id from reader URLs of the form
    # "/read/_/12345/some-title".
    cure = re.compile(r"/read/_/(\d+)/[^/]*")
    def ch(self):
        """Fetch, cache and return the list of chapter objects.

        Only rows tagged with the library's language class are kept.
        Raises Exception on a chapter URL that does not match cure.
        """
        if self.cch is None:
            page = soupify(htcache.fetch(self.url))
            cls = byclass(page, u"table", u"chapters_list")
            # The rows may or may not be wrapped in a <tbody>.
            if cls.tbody is not None:
                cls = cls.tbody
            # Row filter class, e.g. u"lang_English".
            scl = u"lang_" + self.lib.lang
            cch = []
            for ch in cls.childGenerator():
                if isinstance(ch, BeautifulSoup.Tag) and ch.name == u"tr":
                    cll = ch.get("class", "").split()
                    if u"row" in cll and scl in cll:
                        url = ch.td.a["href"].encode("us-ascii")
                        m = self.cure.search(url)
                        if m is None: raise Exception("Got weird chapter URL: %r" % url)
                        cid = m.group(1)
                        # Rebuild a canonical reader URL from the id.
                        url = self.lib.base + "read/_/" + cid
                        name = ch.td.a.text
                        cch.append((cid, name, url))
            # The site lists newest chapters first; store oldest first.
            cch.reverse()
            rch = []
            for n, (cid, name, url) in enumerate(cch):
                rch.append(chapter(self, [(self, n)], cid, name, url))
            self.cch = rch
        return self.cch

    def altnames(self):
        """Fetch, cache and return the list of alternate names, taken
        from the "Alt Names:" row of the series info table.

        Raises Exception when the expected table structure is missing.
        """
        if self.cnames is None:
            page = soupify(htcache.fetch(self.url))
            cnames = None
            for tbl in page.findAll("table", attrs={"class": "ipb_table"}):
                if tbl.tbody is not None: tbl = tbl.tbody
                for tr in tbl.findAll("tr"):
                    if u"Alt Names:" in tr.td.text:
                        # The names are <span> elements in the cell
                        # following the label cell.
                        nls = nextel(tr.td)
                        if nls.name != u"td" or nls.span is None:
                            raise Exception("Weird altnames table in " + self.id)
                        cnames = [nm.text.strip() for nm in nls.findAll("span")]
                        break
                if cnames is not None:
                    break
            if cnames is None:
                raise Exception("Could not find altnames for " + self.id)
            self.cnames = cnames
        return self.cnames

    def __str__(self):
        return self.name

    def __repr__(self):
        return "<batoto.manga %r>" % self.name
147
class library(lib.library):
    """The Batoto backend: id lookup and name search on batoto.net."""

    def __init__(self):
        self.base = "http://www.batoto.net/"
        # Only chapters tagged with this language are listed.
        self.lang = u"English"

    def byid(self, id):
        """Return the manga with the given site id.

        Raises KeyError when the id does not resolve to a series page.
        """
        url = self.base + "comic/_/comics/" + id
        page = soupify(htcache.fetch(url))
        title = page.find("h1", attrs={"class": "ipsType_pagetitle"})
        if title is None:
            raise KeyError(id)
        return manga(self, id, title.string.strip(), url)

    def _search(self, pars):
        """Yield manga objects for the search parameters pars, walking
        successive result pages until the site reports no more."""
        p = 1
        while True:
            _pars = dict(pars)
            _pars["p"] = str(p)
            resp = urllib.urlopen(self.base + "search?" + urllib.urlencode(_pars))
            try:
                page = soupify(resp.read())
            finally:
                resp.close()
            rls = page.find("div", id="comic_search_results").table
            if rls.tbody is not None:
                rls = rls.tbody
            hasmore = False
            for child in rls.findAll("tr"):
                # Skip the header row and secondary info rows.
                if child.th is not None: continue
                if child.get("id", u"")[:11] == u"comic_rowo_": continue
                if child.get("id") == u"show_more_row":
                    # Marker row: another page of results exists.
                    hasmore = True
                    continue
                link = child.td.strong.a
                url = link["href"].encode("us-ascii")
                m = self.rure.search(url)
                if m is None: raise Exception("Got weird manga URL: %r" % url)
                id = m.group(1)
                name = link.text.strip()
                yield manga(self, id, name, url)
            p += 1
            if not hasmore:
                break

    # Extracts the manga id from URLs of the form "/comic/_/<id>".
    rure = re.compile(r"/comic/_/([^/]*)$")
    def search(self, expr):
        """Return an iterator of manga whose name contains expr."""
        if not isinstance(expr, unicode):
            expr = expr.decode("utf8")
        return self._search({"name": expr.encode("utf8"), "name_cond": "c"})

    def byname(self, prefix):
        """Yield manga whose name, or one of whose alternate names,
        starts with prefix (compared case-insensitively).

        When only an alternate name matches, the yielded manga carries
        that alternate name as its name.
        """
        if not isinstance(prefix, unicode):
            prefix = prefix.decode("utf8")
        # Hoisted out of the loops: the lowered prefix is loop-invariant.
        pl = prefix.lower()
        for res in self._search({"name": prefix.encode("utf8"), "name_cond": "s"}):
            if res.name[:len(prefix)].lower() == pl:
                yield res
            else:
                # The site also matches on alternate names; find which
                # one matched and present it as the manga's name.
                # (A dead `if False:` debug block that printed elided
                # results was removed here.)
                for aname in res.altnames():
                    if aname[:len(prefix)].lower() == pl:
                        yield manga(self, res.id, aname, res.url)
                        break