WIP: Trying to get Batoto to work with new site.
[automanga.git] / manga / batoto.py
CommitLineData
81be6921
FT
1import urllib.request, urllib.parse, http.cookiejar, re, bs4, os
2from . import profile, lib, htcache
# Shorthand for the BeautifulSoup class used throughout this module.
soup = bs4.BeautifulSoup

def soupify(cont):
    """Parse *cont* (HTML bytes or text) into a BeautifulSoup tree.

    Uses the stdlib ``html.parser`` backend so no extra parser package
    is required.  (Was a lambda; PEP 8 (E731) prefers a named ``def``.)"""
    return soup(cont, "html.parser")
class pageerror(Exception):
    """Raised when a fetched page does not have the structure we expect.

    The parsed page is kept on the ``page`` attribute so callers can
    inspect what was actually received."""
    def __init__(self, message, page):
        super().__init__(message)
        # The offending BeautifulSoup document, for post-mortem inspection.
        self.page = page
def byclass(el, name, cl):
    """Return the first ``name`` tag under *el* carrying CSS class *cl*.

    Returns None when no such tag exists."""
    candidates = (tag for tag in el.findAll(name)
                  if isinstance(tag, bs4.Tag) and cl in tag.get("class", []))
    return next(candidates, None)
def nextel(el):
    """Return the next sibling of *el* that is a real tag, skipping
    text nodes and other non-tag siblings."""
    el = el.nextSibling
    while not isinstance(el, bs4.Tag):
        el = el.nextSibling
    return el
class page(lib.page):
    """A single page of a Batoto chapter.

    The reader page is fetched lazily the first time the image URL is
    needed, and the result is cached on the instance."""

    def __init__(self, chapter, stack, n, url):
        self.stack = stack
        self.chapter = chapter
        self.n = n
        self.id = str(n)
        self.name = "Page %s" % n
        self.url = url
        # Cached image URL; resolved on first call to iurl().
        self.ciurl = None

    def iurl(self):
        """Return the URL of this page's image, fetching it on first use."""
        if self.ciurl is None:
            page = soupify(htcache.fetch(self.url))
            img = nextel(page.find("div", id="full_image")).img
            self.ciurl = img["src"]
        return self.ciurl

    def open(self):
        """Open a standard image stream for this page."""
        return lib.stdimgstream(self.iurl())

    def __str__(self):
        return self.name

    def __repr__(self):
        # BUGFIX: was misspelled ``__repr`` (missing trailing underscores),
        # so it never acted as the actual repr; now consistent with the
        # chapter and manga classes.
        return "<batoto.page %r.%r.%r>" % (self.chapter.manga.name, self.chapter.name, self.name)
class chapter(lib.pagelist):
    """One chapter of a manga; its page list is scraped lazily and cached."""

    # Matches the "page N" text of the reader's page-select options.
    pnre = re.compile(r"page (\d+)")

    def __init__(self, manga, stack, id, name, url):
        self.stack = stack
        self.manga = manga
        self.id = id
        self.name = name
        self.url = url
        self.cpag = None

    def __getitem__(self, i):
        return self.pages()[i]

    def __len__(self):
        return len(self.pages())

    def pages(self):
        """Return (and cache) the page objects of this chapter."""
        if self.cpag is not None:
            return self.cpag
        doc = soupify(htcache.fetch(self.url))
        pags = []
        for opt in doc.find("select", id="page_select").findAll("option"):
            pn = int(self.pnre.match(opt.string).group(1))
            pags.append(page(self, self.stack + [(self, len(pags))], pn, opt["value"]))
        self.cpag = pags
        return self.cpag

    def __str__(self):
        return self.name

    def __repr__(self):
        return "<batoto.chapter %r.%r>" % (self.manga.name, self.name)
class manga(lib.manga):
    # One manga (comic) on Batoto.  The chapter list and the alternate
    # names are scraped lazily and cached on the instance.
    def __init__(self, lib, id, name, url):
        self.lib = lib
        self.sess = lib.sess
        self.id = id
        self.name = name
        self.url = url
        self.cch = None      # cached chapter list (see ch())
        self.stack = []
        self.cnames = None   # cached alternate names (see altnames())

    def __getitem__(self, i):
        return self.ch()[i]

    def __len__(self):
        return len(self.ch())

    @staticmethod
    def vfylogin(page):
        # Heuristic login check used by sess.lfetch(): a registration
        # notice means we are anonymous, and a missing chapter table
        # means the page did not render as expected.
        if page.find("div", id="register_notice"):
            return False
        if not byclass(page, "table", "chapters_list"):
            return False
        return True

    # Extracts the numeric chapter id out of a /read/_/<id>/... URL.
    cure = re.compile(r"/read/_/(\d+)/[^/]*")
    def ch(self):
        # Return (and cache) the chapter objects, oldest first.
        if self.cch is None:
            page = self.sess.lfetch(self.url, self.vfylogin)
            cls = byclass(page, "table", "chapters_list")
            if cls.tbody is not None:
                cls = cls.tbody
            # Only rows in the library's configured language are kept.
            scl = "lang_" + self.lib.lang
            cch = []
            for ch in cls.childGenerator():
                if isinstance(ch, bs4.Tag) and ch.name == "tr":
                    cll = ch.get("class", [])
                    if "row" in cll and scl in cll:
                        url = ch.td.a["href"]
                        m = self.cure.search(url)
                        if m is None: raise pageerror("Got weird chapter URL: %r" % url, page)
                        cid = m.group(1)
                        # Rebuild a canonical reader URL from the id.
                        url = self.lib.base + "read/_/" + cid
                        name = ch.td.a.text
                        cch.append((cid, name, url))
            # The site lists newest first; we want chronological order.
            cch.reverse()
            rch = []
            for n, (cid, name, url) in enumerate(cch):
                rch.append(chapter(self, [(self, n)], cid, name, url))
            self.cch = rch
        return self.cch

    def altnames(self):
        # Scrape the "Alt Names:" row of the manga's info table and
        # return (and cache) the list of alternate names.
        if self.cnames is None:
            page = soupify(self.sess.fetch(self.url))
            cnames = None
            for tbl in page.findAll("table", attrs={"class": "ipb_table"}):
                if tbl.tbody is not None: tbl = tbl.tbody
                for tr in tbl.findAll("tr"):
                    if "Alt Names:" in tr.td.text:
                        # The names live in <span>s in the next cell.
                        nls = nextel(tr.td)
                        if nls.name != "td" or nls.span is None:
                            raise pageerror("Weird altnames table in " + self.id, page)
                        cnames = [nm.text.strip() for nm in nls.findAll("span")]
                        break
                if cnames is not None:
                    break
            if cnames is None:
                raise pageerror("Could not find altnames for " + self.id, page)
            self.cnames = cnames
        return self.cnames

    def __str__(self):
        return self.name

    def __repr__(self):
        return "<batoto.manga %r>" % self.name
class credentials(object):
    """A username/password pair for logging in to Batoto."""

    def __init__(self, username, password):
        self.username = username
        self.password = password

    @classmethod
    def fromfile(cls, path):
        """Parse credentials from the profile file at *path*.

        Recognized keys are ``username``, ``password`` and ``pass64``
        (a base64-encoded password).  Raises ValueError when either
        part is missing."""
        username = password = None
        with open(path) as fp:
            for words in profile.splitlines(fp):
                key = words[0]
                if key == "username":
                    username = words[1]
                elif key == "password":
                    password = words[1]
                elif key == "pass64":
                    import binascii
                    password = binascii.a2b_base64(words[1]).decode("utf8")
        if username is None or password is None:
            raise ValueError("Incomplete profile: " + path)
        return cls(username, password)

    @classmethod
    def default(cls):
        """Load credentials from the default profile location, or None."""
        path = os.path.join(profile.confdir, "batoto")
        if os.path.exists(path):
            return cls.fromfile(path)
        return None
class session(object):
    """A cookie-aware HTTP session against Batoto.

    Logs in lazily with the given credentials on first demand; with no
    credentials the session stays anonymous."""

    def __init__(self, base, credentials):
        self.base = base
        self.creds = credentials
        self.jar = http.cookiejar.CookieJar()
        self.web = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(self.jar))
        self.loggedin = False

    # Extracts the current username from the "Welcome, NAME " link text.
    rlre = re.compile(r"Welcome, (.*) ")
    def dologin(self):
        """Log in as self.creds, signing out first if some other user is
        currently logged in.

        Raises pageerror when the login flow cannot be completed."""
        with self.web.open(self.base) as hs:
            page = soupify(hs.read())

        cur = page.find("a", id="user_link")
        if cur:
            m = self.rlre.search(cur.get_text())
            if not m or m.group(1) != self.creds.username:
                # A different user is logged in: locate the sign-out link
                # and log out before proceeding.
                outurl = None
                nav = page.find("div", id="user_navigation")
                if nav:
                    for li in nav.findAll("li"):
                        # Guard li.a.string: it is None for nested markup.
                        if li.a and li.a.string and "Sign Out" in li.a.string:
                            outurl = li.a["href"]
                if not outurl:
                    raise pageerror("Could not find logout URL", page)
                # BUGFIX: was ``self.wep.open`` (typo), which raised
                # AttributeError whenever this path was taken.
                with self.web.open(outurl) as hs:
                    hs.read()
                with self.web.open(self.base) as hs:
                    page = soupify(hs.read())
            else:
                # Already logged in as the requested user; nothing to do.
                return

        # Submit the login form, preserving all hidden fields.
        form = page.find("form", id="login")
        values = {}
        for el in form.findAll("input", type="hidden"):
            values[el["name"]] = el["value"]
        values["ips_username"] = self.creds.username
        values["ips_password"] = self.creds.password
        values["anonymous"] = "1"
        req = urllib.request.Request(form["action"], urllib.parse.urlencode(values).encode("ascii"))
        with self.web.open(req) as hs:
            page = soupify(hs.read())
        # Verify the "signed in" confirmation message appeared.
        for resp in page.findAll("p", attrs={"class": "message"}):
            if resp.strong and "You are now signed in" in resp.strong.string:
                break
        else:
            raise pageerror("Could not log in", page)

    def login(self):
        """Ensure dologin() has run once; a no-op without credentials."""
        if not self.loggedin:
            if self.creds:
                self.dologin()
            self.loggedin = True

    def open(self, url):
        """Open *url* through this session's cookie-carrying opener."""
        return self.web.open(url)

    def fetch(self, url):
        """Fetch *url* and return the raw response body."""
        with self.open(url) as hs:
            return hs.read()

    def lfetch(self, url, ck):
        """Fetch *url*; if ck(page) fails, log in and retry once.

        Raises pageerror if the page still fails verification after
        logging in."""
        page = soupify(self.fetch(url))
        if not ck(page):
            self.login()
            page = soupify(self.fetch(url))
            if not ck(page):
                raise pageerror("Could not verify login status despite having logged in", page)
        return page
class library(lib.library):
    """The Batoto comic library: lookup by id, full search, and
    prefix matching against primary and alternate names."""

    def __init__(self, *, creds=None):
        if creds is None:
            creds = credentials.default()
        self.base = "http://bato.to/"
        self.sess = session(self.base, creds)
        self.lang = "English"

    def byid(self, id):
        """Return the manga with the given id; raise KeyError if absent."""
        url = self.base + "comic/_/comics/" + id
        page = soupify(self.sess.fetch(url))
        title = page.find("h1", attrs={"class": "ipsType_pagetitle"})
        if title is None:
            raise KeyError(id)
        return manga(self, id, title.string.strip(), url)

    # Extracts the manga id from a /comic/_/<id> URL.
    rure = re.compile(r"/comic/_/([^/]*)$")

    def _search(self, pars):
        """Yield manga matching the search parameters, walking through
        all result pages via the "show more" marker row."""
        p = 1
        while True:
            _pars = dict(pars)
            _pars["p"] = str(p)
            # BUGFIX: urlencode() returns a str; the old code encoded it
            # to bytes and concatenating str + bytes raised TypeError.
            resp = urllib.request.urlopen(self.base + "search?" + urllib.parse.urlencode(_pars))
            try:
                page = soupify(resp.read())
            finally:
                resp.close()
            rls = page.find("div", id="comic_search_results").table
            if rls.tbody is not None:
                rls = rls.tbody
            hasmore = False
            for child in rls.findAll("tr"):
                if child.th is not None: continue          # header row
                if child.get("id", "")[:11] == "comic_rowo_": continue
                if child.get("id") == "show_more_row":
                    # Marker row meaning further result pages exist.
                    hasmore = True
                    continue
                link = child.td.strong.a
                url = link["href"]
                m = self.rure.search(url)
                if m is None: raise Exception("Got weird manga URL: %r" % url)
                id = m.group(1)
                name = link.text.strip()
                yield manga(self, id, name, url)
            p += 1
            if not hasmore:
                break

    def search(self, expr):
        """Search for manga whose name contains *expr*."""
        return self._search({"name": expr, "name_cond": "c"})

    def byname(self, prefix):
        """Yield manga whose name, or one of whose alternate names,
        starts with *prefix* (case-insensitively).  Alternate-name hits
        are yielded under that alternate name."""
        for res in self._search({"name": prefix, "name_cond": "s"}):
            if res.name[:len(prefix)].lower() == prefix.lower():
                yield res
            else:
                for aname in res.altnames():
                    if aname[:len(prefix)].lower() == prefix.lower():
                        yield manga(self, res.id, aname, res.url)
                        break
                # Results matching neither name are silently elided.
                # (Removed dead ``if False:`` debug prints that were here.)