WIP: Trying to get Batoto to work with new site.
[automanga.git] / manga / batoto.py
1 import urllib.request, urllib.parse, http.cookiejar, re, bs4, os
2 from . import profile, lib, htcache
soup = bs4.BeautifulSoup

def soupify(cont):
    """Parse raw page content into a BeautifulSoup tree using the html.parser backend."""
    # PEP 8 (E731): a def is preferred over assigning a lambda to a name.
    return soup(cont, "html.parser")
5
class pageerror(Exception):
    """Error raised while interpreting a fetched page.

    The parsed page is kept on the exception so callers can inspect
    what the site actually returned."""

    def __init__(self, message, page):
        super().__init__(message)
        self.page = page
10
def byclass(el, name, cl):
    """Return the first <name> tag under el carrying the CSS class cl, or None."""
    for candidate in el.findAll(name):
        # findAll can yield non-Tag nodes; only Tags have classes.
        if isinstance(candidate, bs4.Tag) and cl in candidate.get("class", []):
            return candidate
    return None
18
def nextel(el):
    """Skip over non-Tag siblings (text nodes etc.) and return the next Tag."""
    nxt = el.nextSibling
    while not isinstance(nxt, bs4.Tag):
        nxt = nxt.nextSibling
    return nxt
24
class page(lib.page):
    """A single page of a chapter; the image URL is resolved lazily."""

    def __init__(self, chapter, stack, n, url):
        self.stack = stack
        self.chapter = chapter
        self.n = n
        self.id = str(n)
        self.name = "Page %s" % n
        self.url = url
        self.ciurl = None  # cached image URL, filled in by iurl()

    def iurl(self):
        """Return the image URL for this page, fetching and caching it on first use."""
        if self.ciurl is None:
            page = soupify(htcache.fetch(self.url))
            img = nextel(page.find("div", id="full_image")).img
            self.ciurl = img["src"]
        return self.ciurl

    def open(self):
        """Open a standard image stream for this page's image."""
        return lib.stdimgstream(self.iurl())

    def __str__(self):
        return self.name

    def __repr__(self):
        # BUGFIX: was misspelled "__repr", so repr() never picked it up.
        return "<batoto.page %r.%r.%r>" % (self.chapter.manga.name, self.chapter.name, self.name)
50
class chapter(lib.pagelist):
    """One chapter of a manga; its page list is parsed on demand."""

    # Matches e.g. "page 12" in the page-select dropdown.
    pnre = re.compile(r"page (\d+)")

    def __init__(self, manga, stack, id, name, url):
        self.stack = stack
        self.manga = manga
        self.id = id
        self.name = name
        self.url = url
        self.cpag = None  # lazily built list of page objects

    def __getitem__(self, i):
        return self.pages()[i]

    def __len__(self):
        return len(self.pages())

    def pages(self):
        """Parse the chapter's reader page and return (caching) its pages."""
        if self.cpag is None:
            doc = soupify(htcache.fetch(self.url))
            pags = []
            for opt in doc.find("select", id="page_select").findAll("option"):
                num = int(self.pnre.match(opt.string).group(1))
                pags.append(page(self, self.stack + [(self, len(pags))], num, opt["value"]))
            self.cpag = pags
        return self.cpag

    def __str__(self):
        return self.name

    def __repr__(self):
        return "<batoto.chapter %r.%r>" % (self.manga.name, self.name)
83
class manga(lib.manga):
    """A manga on Batoto; chapter list and alternative names are parsed lazily."""

    # Extracts the numeric chapter id out of a /read/_/<id>/... URL.
    cure = re.compile(r"/read/_/(\d+)/[^/]*")

    def __init__(self, lib, id, name, url):
        self.lib = lib
        self.sess = lib.sess
        self.id = id
        self.name = name
        self.url = url
        self.cch = None     # cached chapter list
        self.stack = []
        self.cnames = None  # cached alternative names

    def __getitem__(self, i):
        return self.ch()[i]

    def __len__(self):
        return len(self.ch())

    @staticmethod
    def vfylogin(page):
        """Heuristic check that the page was served to a logged-in session."""
        if page.find("div", id="register_notice"):
            return False
        if not byclass(page, "table", "chapters_list"):
            return False
        return True

    def ch(self):
        """Return (caching) the chapters available in the library's language."""
        if self.cch is None:
            page = self.sess.lfetch(self.url, self.vfylogin)
            tbl = byclass(page, "table", "chapters_list")
            if tbl.tbody is not None:
                tbl = tbl.tbody
            want = "lang_" + self.lib.lang
            found = []
            for row in tbl.childGenerator():
                if not (isinstance(row, bs4.Tag) and row.name == "tr"):
                    continue
                classes = row.get("class", [])
                if "row" not in classes or want not in classes:
                    continue
                url = row.td.a["href"]
                m = self.cure.search(url)
                if m is None:
                    raise pageerror("Got weird chapter URL: %r" % url, page)
                cid = m.group(1)
                # Normalize to the canonical reader URL for this chapter id.
                found.append((cid, row.td.a.text, self.lib.base + "read/_/" + cid))
            found.reverse()
            self.cch = [chapter(self, [(self, n)], cid, name, url)
                        for n, (cid, name, url) in enumerate(found)]
        return self.cch

    def altnames(self):
        """Return (caching) the list of alternative titles for this manga."""
        if self.cnames is None:
            page = soupify(self.sess.fetch(self.url))
            names = None
            for tbl in page.findAll("table", attrs={"class": "ipb_table"}):
                body = tbl.tbody if tbl.tbody is not None else tbl
                for tr in body.findAll("tr"):
                    if "Alt Names:" not in tr.td.text:
                        continue
                    nls = nextel(tr.td)
                    if nls.name != "td" or nls.span is None:
                        raise pageerror("Weird altnames table in " + self.id, page)
                    names = [nm.text.strip() for nm in nls.findAll("span")]
                    break
                if names is not None:
                    break
            if names is None:
                raise pageerror("Could not find altnames for " + self.id, page)
            self.cnames = names
        return self.cnames

    def __str__(self):
        return self.name

    def __repr__(self):
        return "<batoto.manga %r>" % self.name
161
class credentials(object):
    """A Batoto username/password pair."""

    def __init__(self, username, password):
        self.username = username
        self.password = password

    @classmethod
    def fromfile(cls, path):
        """Read credentials from a profile file.

        Raises ValueError if either the username or the password is missing."""
        username = password = None
        with open(path) as fp:
            for words in profile.splitlines(fp):
                key = words[0]
                if key == "username":
                    username = words[1]
                elif key == "password":
                    password = words[1]
                elif key == "pass64":
                    # Password stored base64-encoded in the profile.
                    import binascii
                    password = binascii.a2b_base64(words[1]).decode("utf8")
        if None in (username, password):
            raise ValueError("Incomplete profile: " + path)
        return cls(username, password)

    @classmethod
    def default(cls):
        """Load credentials from the standard profile location, or None if absent."""
        path = os.path.join(profile.confdir, "batoto")
        return cls.fromfile(path) if os.path.exists(path) else None
189
class session(object):
    """An HTTP session against a Batoto site, handling cookies and login."""

    # Extracts the logged-in username from the "Welcome, NAME " link text.
    rlre = re.compile(r"Welcome, (.*) ")

    def __init__(self, base, credentials):
        self.base = base
        self.creds = credentials  # may be None for anonymous browsing
        self.jar = http.cookiejar.CookieJar()
        self.web = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(self.jar))
        self.loggedin = False

    def dologin(self):
        """Log in as self.creds, signing out first if another user is active.

        Raises pageerror if a stale session's logout link cannot be found
        or the site does not acknowledge the login."""
        with self.web.open(self.base) as hs:
            page = soupify(hs.read())

        cur = page.find("a", id="user_link")
        if cur:
            m = self.rlre.search(cur.get_text())
            if m and m.group(1) == self.creds.username:
                # Already signed in as the right user; nothing to do.
                return
            # Signed in as somebody else: find and follow the sign-out link.
            outurl = None
            nav = page.find("div", id="user_navigation")
            if nav:
                for li in nav.findAll("li"):
                    # .string is None for links with nested markup; guard
                    # against TypeError from `in None`.
                    if li.a and li.a.string and "Sign Out" in li.a.string:
                        outurl = li.a["href"]
            if not outurl:
                raise pageerror("Could not find logout URL", page)
            # BUGFIX: was `self.wep.open` (typo) and raised AttributeError.
            with self.web.open(outurl) as hs:
                hs.read()
            with self.web.open(self.base) as hs:
                page = soupify(hs.read())

        form = page.find("form", id="login")
        values = {}
        for el in form.findAll("input", type="hidden"):
            values[el["name"]] = el["value"]
        values["ips_username"] = self.creds.username
        values["ips_password"] = self.creds.password
        values["anonymous"] = "1"
        req = urllib.request.Request(form["action"],
                                     urllib.parse.urlencode(values).encode("ascii"))
        with self.web.open(req) as hs:
            page = soupify(hs.read())
        for resp in page.findAll("p", attrs={"class": "message"}):
            if resp.strong and resp.strong.string and "You are now signed in" in resp.strong.string:
                break
        else:
            raise pageerror("Could not log in", page)

    def login(self):
        """Ensure we are logged in; performs the actual login at most once."""
        if not self.loggedin:
            if self.creds:
                self.dologin()
            self.loggedin = True

    def open(self, url):
        """Open url through this session's cookie-carrying opener."""
        return self.web.open(url)

    def fetch(self, url):
        """Fetch url and return the raw response body."""
        with self.open(url) as hs:
            return hs.read()

    def lfetch(self, url, ck):
        """Fetch url; if the check ck(page) fails, log in and retry once."""
        page = soupify(self.fetch(url))
        if not ck(page):
            self.login()
            page = soupify(self.fetch(url))
            if not ck(page):
                raise pageerror("Could not verify login status despite having logged in", page)
        return page
264
class library(lib.library):
    """The Batoto library: lookup by id, search, and name-prefix matching."""

    # Extracts the manga id out of a /comic/_/<id> URL.
    rure = re.compile(r"/comic/_/([^/]*)$")

    def __init__(self, *, creds=None):
        if creds is None:
            creds = credentials.default()
        self.base = "http://bato.to/"
        self.sess = session(self.base, creds)
        self.lang = "English"

    def byid(self, id):
        """Return the manga with the given site id; raise KeyError if unknown."""
        url = self.base + "comic/_/comics/" + id
        page = soupify(self.sess.fetch(url))
        title = page.find("h1", attrs={"class": "ipsType_pagetitle"})
        if title is None:
            raise KeyError(id)
        return manga(self, id, title.string.strip(), url)

    def _search(self, pars):
        """Yield manga objects for every result page of a search query."""
        p = 1
        while True:
            _pars = dict(pars)
            _pars["p"] = str(p)
            # BUGFIX: the query string must remain a str; concatenating the
            # old `.encode("ascii")` bytes to self.base raised TypeError.
            resp = urllib.request.urlopen(self.base + "search?" + urllib.parse.urlencode(_pars))
            try:
                page = soupify(resp.read())
            finally:
                resp.close()
            rls = page.find("div", id="comic_search_results").table
            if rls.tbody is not None:
                rls = rls.tbody
            hasmore = False
            for child in rls.findAll("tr"):
                if child.th is not None:
                    continue
                # Skip the secondary ("rowo") rows of each result entry.
                if child.get("id", "")[:11] == "comic_rowo_":
                    continue
                if child.get("id") == "show_more_row":
                    hasmore = True
                    continue
                link = child.td.strong.a
                url = link["href"]
                m = self.rure.search(url)
                if m is None:
                    # Consistent with the rest of the module (was a bare Exception;
                    # pageerror is an Exception subclass, so callers still work).
                    raise pageerror("Got weird manga URL: %r" % url, page)
                yield manga(self, m.group(1), link.text.strip(), url)
            p += 1
            if not hasmore:
                break

    def search(self, expr):
        """Search for manga whose name contains expr."""
        return self._search({"name": expr, "name_cond": "c"})

    def byname(self, prefix):
        """Yield manga whose name, or one of its alternative names, starts
        with prefix (case-insensitive)."""
        for res in self._search({"name": prefix, "name_cond": "s"}):
            if res.name[:len(prefix)].lower() == prefix.lower():
                yield res
            else:
                for aname in res.altnames():
                    if aname[:len(prefix)].lower() == prefix.lower():
                        # Present the manga under the matching alternate name.
                        yield manga(self, res.id, aname, res.url)
                        break