From 81be6921788ee49cf63c91e4de3e6375b10654f6 Mon Sep 17 00:00:00 2001
From: Fredrik Tolf
Date: Wed, 30 Dec 2015 16:16:02 +0100
Subject: [PATCH] WIP: Trying to get Batoto to work with new site.

---
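
Some notes for reviewers; git am drops everything between the "---"
above and the diffstat, so none of this ends up in the commit.

The new credentials class reads a word-per-line profile file named
"batoto" under profile.confdir. As a minimal sketch, assuming
profile.splitlines() yields the whitespace-split words of each
non-empty line (the names below are made up):

    username myname
    password mysecret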
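
Alternatively, the password can be stored base64-wrapped under the
pass64 key; the code decodes it with binascii.a2b_base64() and
interprets the result as UTF-8. A wrapped value can be produced with
the stdlib, e.g.:

    >>> import binascii
    >>> binascii.b2a_base64("mysecret".encode("utf8")).decode("ascii")
    'bXlzZWNyZXQ=\n'

so the equivalent profile would read:

    username myname
    pass64 bXlzZWNyZXQ=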
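
Callers should not need any changes: library() falls back to
credentials.default(), and anonymous operation still works because
session.login() only calls dologin() when credentials are present.
Passing credentials explicitly is also possible; a hypothetical test
setup (the import path just assumes batoto.py stays in the manga
package):

    from manga import batoto

    lib = batoto.library(creds=batoto.credentials("myname", "mysecret"))
    for res in lib.search("toriko"):
        print(res.name)
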
 manga/batoto.py | 196 ++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 156 insertions(+), 40 deletions(-)

diff --git a/manga/batoto.py b/manga/batoto.py
index baa602f..37ab8b4 100644
--- a/manga/batoto.py
+++ b/manga/batoto.py
@@ -1,20 +1,25 @@
-import urllib, re, BeautifulSoup
-import lib, htcache
-soup = BeautifulSoup.BeautifulSoup
-soupify = lambda cont: soup(cont, convertEntities=soup.HTML_ENTITIES)
+import urllib.request, urllib.parse, http.cookiejar, re, bs4, os
+from . import profile, lib, htcache
+soup = bs4.BeautifulSoup
+soupify = lambda cont: soup(cont, "html.parser")
+
+class pageerror(Exception):
+    def __init__(self, message, page):
+        super().__init__(message)
+        self.page = page
 
 def byclass(el, name, cl):
     for ch in el.findAll(name):
-        if not isinstance(ch, BeautifulSoup.Tag): continue
-        cll = ch.get("class", "")
-        if cl in cll.split():
+        if not isinstance(ch, bs4.Tag): continue
+        cll = ch.get("class", [])
+        if cl in cll:
             return ch
     return None
 
 def nextel(el):
     while True:
         el = el.nextSibling
-        if isinstance(el, BeautifulSoup.Tag):
+        if isinstance(el, bs4.Tag):
             return el
 
 class page(lib.page):
@@ -23,7 +28,7 @@ class page(lib.page):
         self.chapter = chapter
         self.n = n
         self.id = str(n)
-        self.name = u"Page %s" % n
+        self.name = "Page %s" % n
         self.url = url
         self.ciurl = None
 
@@ -31,7 +36,7 @@ class page(lib.page):
         if self.ciurl is None:
             page = soupify(htcache.fetch(self.url))
             img = nextel(page.find("div", id="full_image")).img
-            self.ciurl = img["src"].encode("us-ascii")
+            self.ciurl = img["src"]
         return self.ciurl
 
     def open(self):
@@ -64,7 +69,7 @@ class chapter(lib.pagelist):
             pg = soupify(htcache.fetch(self.url))
             cpag = []
             for opt in pg.find("select", id="page_select").findAll("option"):
-                url = opt["value"].encode("us-ascii")
+                url = opt["value"]
                 n = int(self.pnre.match(opt.string).group(1))
                 cpag.append(page(self, self.stack + [(self, len(cpag))], n, url))
             self.cpag = cpag
@@ -79,6 +84,7 @@ class chapter(lib.pagelist):
 class manga(lib.manga):
     def __init__(self, lib, id, name, url):
         self.lib = lib
+        self.sess = lib.sess
         self.id = id
         self.name = name
         self.url = url
@@ -92,22 +98,30 @@ class manga(lib.manga):
     def __len__(self):
         return len(self.ch())
 
+    @staticmethod
+    def vfylogin(page):
+        if page.find("div", id="register_notice"):
+            return False
+        if not byclass(page, "table", "chapters_list"):
+            return False
+        return True
+
     cure = re.compile(r"/read/_/(\d+)/[^/]*")
     def ch(self):
         if self.cch is None:
-            page = soupify(htcache.fetch(self.url))
-            cls = byclass(page, u"table", u"chapters_list")
+            page = self.sess.lfetch(self.url, self.vfylogin)
+            cls = byclass(page, "table", "chapters_list")
             if cls.tbody is not None:
                 cls = cls.tbody
-            scl = u"lang_" + self.lib.lang
+            scl = "lang_" + self.lib.lang
             cch = []
             for ch in cls.childGenerator():
-                if isinstance(ch, BeautifulSoup.Tag) and ch.name == u"tr":
-                    cll = ch.get("class", "").split()
-                    if u"row" in cll and scl in cll:
-                        url = ch.td.a["href"].encode("us-ascii")
+                if isinstance(ch, bs4.Tag) and ch.name == "tr":
+                    cll = ch.get("class", [])
+                    if "row" in cll and scl in cll:
+                        url = ch.td.a["href"]
                         m = self.cure.search(url)
-                        if m is None: raise Exception("Got weird chapter URL: %r" % url)
+                        if m is None: raise pageerror("Got weird chapter URL: %r" % url, page)
                         cid = m.group(1)
                         url = self.lib.base + "read/_/" + cid
                         name = ch.td.a.text
@@ -121,21 +135,21 @@ class manga(lib.manga):
 
     def altnames(self):
         if self.cnames is None:
-            page = soupify(htcache.fetch(self.url))
+            page = soupify(self.sess.fetch(self.url))
             cnames = None
             for tbl in page.findAll("table", attrs={"class": "ipb_table"}):
                 if tbl.tbody is not None: tbl = tbl.tbody
                 for tr in tbl.findAll("tr"):
-                    if u"Alt Names:" in tr.td.text:
+                    if "Alt Names:" in tr.td.text:
                         nls = nextel(tr.td)
-                        if nls.name != u"td" or nls.span is None:
-                            raise Exception("Weird altnames table in " + self.id)
+                        if nls.name != "td" or nls.span is None:
+                            raise pageerror("Weird altnames table in " + self.id, page)
                         cnames = [nm.text.strip() for nm in nls.findAll("span")]
                         break
                 if cnames is not None:
                     break
             if cnames is None:
-                raise Exception("Could not find altnames for " + self.id)
+                raise pageerror("Could not find altnames for " + self.id, page)
             self.cnames = cnames
         return self.cnames
 
@@ -145,14 +159,120 @@ class manga(lib.manga):
     def __repr__(self):
         return "<batoto.manga %r>" % self.name
 
+class credentials(object):
+    def __init__(self, username, password):
+        self.username = username
+        self.password = password
+
+    @classmethod
+    def fromfile(cls, path):
+        username, password = None, None
+        with open(path) as fp:
+            for words in profile.splitlines(fp):
+                if words[0] == "username":
+                    username = words[1]
+                elif words[0] == "password":
+                    password = words[1]
+                elif words[0] == "pass64":
+                    import binascii
+                    password = binascii.a2b_base64(words[1]).decode("utf8")
+        if None in (username, password):
+            raise ValueError("Incomplete profile: " + path)
+        return cls(username, password)
+
+    @classmethod
+    def default(cls):
+        path = os.path.join(profile.confdir, "batoto")
+        if os.path.exists(path):
+            return cls.fromfile(path)
+        return None
+
+class session(object):
+    def __init__(self, base, credentials):
+        self.base = base
+        self.creds = credentials
+        self.jar = http.cookiejar.CookieJar()
+        self.web = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(self.jar))
+        self.loggedin = False
+
+    rlre = re.compile(r"Welcome, (.*) ")
+    def dologin(self):
+        with self.web.open(self.base) as hs:
+            page = soupify(hs.read())
+
+        cur = page.find("a", id="user_link")
+        print(0)
+        if cur:
+            m = self.rlre.search(cur.get_text())
+            if not m or m.group(1) != self.creds.username:
+                print(1)
+                outurl = None
+                nav = page.find("div", id="user_navigation")
+                if nav:
+                    for li in nav.findAll("li"):
+                        if li.a and "Sign Out" in li.a.string:
+                            outurl = li.a["href"]
+                if not outurl:
+                    raise pageerror("Could not find logout URL", page)
+                with self.web.open(outurl) as hs:
+                    hs.read()
+                with self.web.open(self.base) as hs:
+                    page = soupify(hs.read())
+            else:
+                print(2)
+                return
+        else:
+            print(3)
+
+        form = page.find("form", id="login")
+        values = {}
+        for el in form.findAll("input", type="hidden"):
+            values[el["name"]] = el["value"]
+        values["ips_username"] = self.creds.username
+        values["ips_password"] = self.creds.password
+        values["anonymous"] = "1"
+        req = urllib.request.Request(form["action"], urllib.parse.urlencode(values).encode("ascii"))
+        with self.web.open(req) as hs:
+            page = soupify(hs.read())
+        for resp in page.findAll("p", attrs={"class": "message"}):
+            if resp.strong and "You are now signed in" in resp.strong.string:
+                break
+        else:
+            raise pageerror("Could not log in", page)
+
+    def login(self):
+        if not self.loggedin:
+            if self.creds:
+                self.dologin()
+            self.loggedin = True
+
+    def open(self, url):
+        return self.web.open(url)
+
+    def fetch(self, url):
+        with self.open(url) as hs:
+            return hs.read()
+
+    def lfetch(self, url, ck):
+        page = soupify(self.fetch(url))
+        if not ck(page):
+            self.login()
+            page = soupify(self.fetch(url))
+            if not ck(page):
+                raise pageerror("Could not verify login status despite having logged in", page)
+        return page
+
 class library(lib.library):
-    def __init__(self):
-        self.base = "http://www.batoto.net/"
-        self.lang = u"English"
+    def __init__(self, *, creds=None):
+        if creds is None:
+            creds = credentials.default()
+        self.base = "http://bato.to/"
+        self.sess = session(self.base, creds)
+        self.lang = "English"
 
     def byid(self, id):
         url = self.base + "comic/_/comics/" + id
-        page = soupify(htcache.fetch(url))
+        page = soupify(self.sess.fetch(url))
         title = page.find("h1", attrs={"class": "ipsType_pagetitle"})
         if title is None:
             raise KeyError(id)
@@ -163,7 +283,7 @@ class library(lib.library):
         while True:
             _pars = dict(pars)
             _pars["p"] = str(p)
-            resp = urllib.urlopen(self.base + "search?" + urllib.urlencode(_pars))
+            resp = urllib.request.urlopen(self.base + "search?" + urllib.parse.urlencode(_pars))
             try:
                 page = soupify(resp.read())
             finally:
@@ -174,12 +294,12 @@ class library(lib.library):
             hasmore = False
             for child in rls.findAll("tr"):
                 if child.th is not None: continue
-                if child.get("id", u"")[:11] == u"comic_rowo_": continue
-                if child.get("id") == u"show_more_row":
+                if child.get("id", "")[:11] == "comic_rowo_": continue
+                if child.get("id") == "show_more_row":
                     hasmore = True
                     continue
                 link = child.td.strong.a
-                url = link["href"].encode("us-ascii")
+                url = link["href"]
                 m = self.rure.search(url)
                 if m is None: raise Exception("Got weird manga URL: %r" % url)
                 id = m.group(1)
@@ -191,14 +311,10 @@ class library(lib.library):
 
     rure = re.compile(r"/comic/_/([^/]*)$")
     def search(self, expr):
-        if not isinstance(expr, unicode):
-            expr = expr.decode("utf8")
-        return self._search({"name": expr.encode("utf8"), "name_cond": "c"})
+        return self._search({"name": expr, "name_cond": "c"})
 
     def byname(self, prefix):
-        if not isinstance(prefix, unicode):
-            prefix = prefix.decode("utf8")
-        for res in self._search({"name": prefix.encode("utf8"), "name_cond": "s"}):
+        for res in self._search({"name": prefix, "name_cond": "s"}):
             if res.name[:len(prefix)].lower() == prefix.lower():
                 yield res
             else:
@@ -208,5 +324,5 @@ class library(lib.library):
                         break
                 else:
                     if False:
-                        print "eliding " + res.name
-                        print res.altnames()
+                        print("eliding " + res.name)
+                        print(res.altnames())
-- 
2.11.0