X-Git-Url: http://dolda2000.com/gitweb/?a=blobdiff_plain;f=manga%2Fbatoto.py;h=4343a0597a4ed4da293ed663b6a2288618e84dd2;hb=ebc277d35244dd94829cd19c3aadd09cc340384b;hp=cab29b5f56123da274c5314072fb1b82216222f2;hpb=c72e69b8c6044cf4393b496bcfe73700698f4fa6;p=automanga.git

diff --git a/manga/batoto.py b/manga/batoto.py
index cab29b5..4343a05 100644
--- a/manga/batoto.py
+++ b/manga/batoto.py
@@ -1,36 +1,65 @@
-import urllib, re, BeautifulSoup
-import lib, htcache
-soup = BeautifulSoup.BeautifulSoup
+import urllib.request, urllib.parse, http.cookiejar, re, bs4, os, time
+from . import profile, lib, htcache
+soup = bs4.BeautifulSoup
+soupify = lambda cont: soup(cont, "html.parser")
+
+class pageerror(Exception):
+    def __init__(self, message, page):
+        super().__init__(message)
+        self.page = page
+
+def iterlast(itr, default=None):
+    if default is not None:
+        ret = default
+    try:
+        while True:
+            ret = next(itr)
+    except StopIteration:
+        return ret
+
+def find1(el, *args, **kwargs):
+    ret = el.find(*args, **kwargs)
+    if ret is None:
+        raise pageerror("could not find expected element", iterlast(el.parents, el))
+    return ret
 
 def byclass(el, name, cl):
     for ch in el.findAll(name):
-        if not isinstance(ch, BeautifulSoup.Tag): continue
-        cll = ch.get("class", "")
-        if cl in cll.split():
+        if not isinstance(ch, bs4.Tag): continue
+        cll = ch.get("class", [])
+        if cl in cll:
             return ch
     return None
 
 def nextel(el):
     while True:
         el = el.nextSibling
-        if isinstance(el, BeautifulSoup.Tag):
+        if isinstance(el, bs4.Tag):
             return el
 
+def fetchreader(lib, readerid, page):
+    pg = soupify(lib.sess.fetch(lib.base + "areader?" + urllib.parse.urlencode({"id": readerid,
+                                                                                "p": str(page),
+                                                                                "supress_webtoon": "t"}),
+                                headers={"Referer": "http://bato.to/reader"}))
+    return pg
+
 class page(lib.page):
-    def __init__(self, chapter, stack, n, url):
+    def __init__(self, chapter, stack, readerid, n):
         self.stack = stack
+        self.lib = chapter.lib
         self.chapter = chapter
         self.n = n
         self.id = str(n)
-        self.name = u"Page %s" % n
-        self.url = url
+        self.name = "Page %s" % n
+        self.readerid = readerid
         self.ciurl = None
 
     def iurl(self):
         if self.ciurl is None:
-            page = soup(htcache.fetch(self.url))
-            img = nextel(page.find("div", id="full_image")).img
-            self.ciurl = img["src"].encode("us-ascii")
+            page = fetchreader(self.lib, self.readerid, self.n)
+            img = find1(page, "img", id="comic_page")
+            self.ciurl = img["src"]
         return self.ciurl
 
     def open(self):
@@ -40,15 +69,17 @@ class page(lib.page):
         return self.name
 
     def __repr__(self):
-        return "<batoto.page %r, %r, %r>" % (self.chapter.manga.name, self.chapter.name, self.name)
+        return "<batoto.page %r, %r, %r, %r>" % (self.chapter.manga.name, self.chapter.group.name, self.chapter.name, self.name)
 
 class chapter(lib.pagelist):
-    def __init__(self, manga, stack, id, name, url):
+    def __init__(self, group, stack, id, name, readerid):
         self.stack = stack
-        self.manga = manga
+        self.group = group
+        self.manga = group.manga
+        self.lib = self.manga.lib
         self.id = id
         self.name = name
-        self.url = url
+        self.readerid = readerid
         self.cpag = None
 
     def __getitem__(self, i):
@@ -60,12 +91,11 @@ class chapter(lib.pagelist):
     pnre = re.compile(r"page (\d+)")
     def pages(self):
         if self.cpag is None:
-            pg = soup(htcache.fetch(self.url))
+            pg = fetchreader(self.lib, self.readerid, 1)
             cpag = []
-            for opt in pg.find("select", id="page_select").findAll("option"):
-                url = opt["value"].encode("us-ascii")
+            for opt in find1(pg, "select", id="page_select").findAll("option"):
                 n = int(self.pnre.match(opt.string).group(1))
-                cpag.append(page(self, self.stack + [(self, len(cpag))], n, url))
+                cpag.append(page(self, self.stack + [(self, len(cpag))], self.readerid, n))
             self.cpag = cpag
         return self.cpag
 
@@ -73,16 +103,38 @@ class chapter(lib.pagelist):
         return self.name
 
     def __repr__(self):
-        return "<batoto.chapter %r, %r>" % (self.manga.name, self.name)
+        return "<batoto.chapter %r, %r, %r>" % (self.manga.name, self.group.name, self.name)
+
+class group(lib.pagelist):
+    def __init__(self, manga, stack, id, name):
+        self.stack = stack
+        self.manga = manga
+        self.id = id
+        self.name = name
+        self.ch = []
+
+    def __getitem__(self, i):
+        return self.ch[i]
+
+    def __len__(self):
+        return len(self.ch)
+
+    def __str__(self):
+        return self.name
+
+    def __repr__(self):
+        return "<batoto.group %r>" % self.name
+
+class credentials(object):
+    def __init__(self, username, password):
+        self.username = username
+        self.password = password
+
+    @classmethod
+    def fromfile(cls, path):
+        username, password = None, None
+        with open(path) as fp:
+            for words in profile.splitlines(fp):
+                if words[0] == "username":
+                    username = words[1]
+                elif words[0] == "password":
+                    password = words[1]
+                elif words[0] == "pass64":
+                    import binascii
+                    password = binascii.a2b_base64(words[1]).decode("utf8")
+        if None in (username, password):
+            raise ValueError("Incomplete profile: " + path)
+        return cls(username, password)
+
+    @classmethod
+    def default(cls):
+        path = os.path.join(profile.confdir, "batoto")
+        if os.path.exists(path):
+            return cls.fromfile(path)
+        return None
+
+class session(object):
+    def __init__(self, base, credentials):
+        self.base = base
+        self.creds = credentials
+        self.jar = http.cookiejar.CookieJar()
+        self.web = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(self.jar))
+        self.lastlogin = 0
+
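+    # A note on the login flow implemented below: dologin() fetches the
+    # front page (or reuses the page passed in as pre), inspects the
+    # a#user_link element to see who, if anyone, is currently signed in,
+    # signs out first when the name does not match the configured account,
+    # and then submits the site's login form via the ips_* fields.  The
+    # 60-second check guards against tight login loops on bad credentials.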
+    rlre = re.compile(r"Welcome, (.*) ")
+    def dologin(self, pre=None):
+        now = time.time()
+        if now - self.lastlogin < 60:
+            raise Exception("Too soon since last login attempt")
+        if pre is None:
+            with self.web.open(self.base) as hs:
+                page = soupify(hs.read())
+        else:
+            page = pre
+
+        cur = page.find("a", id="user_link")
+        if cur:
+            m = self.rlre.search(cur.text)
+            if not m or m.group(1) != self.creds.username:
+                outurl = None
+                nav = page.find("div", id="user_navigation")
+                if nav:
+                    for li in nav.findAll("li"):
+                        if li.a and "Sign Out" in li.a.string:
+                            outurl = li.a["href"]
+                if not outurl:
+                    raise pageerror("Could not find logout URL", page)
+                with self.web.open(outurl) as hs:
+                    hs.read()
+                with self.web.open(self.base) as hs:
+                    page = soupify(hs.read())
+            else:
+                return
+
+        form = page.find("form", id="login")
+        if not form and pre:
+            return self.dologin()
+        values = {}
+        for el in form.findAll("input", type="hidden"):
+            values[el["name"]] = el["value"]
+        values["ips_username"] = self.creds.username
+        values["ips_password"] = self.creds.password
+        values["rememberMe"] = "1"
+        values["anonymous"] = "1"
+        req = urllib.request.Request(form["action"], urllib.parse.urlencode(values).encode("ascii"))
+        with self.web.open(req) as hs:
+            page = soupify(hs.read())
+        for resp in page.findAll("p", attrs={"class": "message"}):
+            if resp.strong and "You are now signed in" in resp.strong.string:
+                break
+        else:
+            raise pageerror("Could not log in", page)
+        self.lastlogin = now
+
+    def open(self, url):
+        return self.web.open(url)
+
+    def fetch(self, url, headers=None):
+        req = urllib.request.Request(url)
+        if headers is not None:
+            for k, v in headers.items():
+                req.add_header(k, v)
+        with self.open(req) as hs:
+            return hs.read()
+
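+    # lfetch() is for pages whose content depends on login state: ck is a
+    # caller-supplied predicate over the parsed page, and a failed check
+    # triggers one login attempt (reusing the already-parsed page) and a
+    # single retry before giving up.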
+    def lfetch(self, url, ck):
+        page = soupify(self.fetch(url))
+        if not ck(page):
+            self.dologin(pre=page)
+            page = soupify(self.fetch(url))
+            if not ck(page):
+                raise pageerror("Could not verify login status despite having logged in", page)
+        return page
+
 class library(lib.library):
-    def __init__(self):
-        self.base = "http://www.batoto.net/"
-        self.lang = u"English"
+    def __init__(self, *, creds=None):
+        if creds is None:
+            creds = credentials.default()
+        self.base = "http://bato.to/"
+        self.sess = session(self.base, creds)
+        self.lang = "English"
 
     def byid(self, id):
         url = self.base + "comic/_/comics/" + id
-        page = soup(htcache.fetch(url))
+        page = soupify(self.sess.fetch(url))
         title = page.find("h1", attrs={"class": "ipsType_pagetitle"})
         if title is None:
             raise KeyError(id)
         return manga(self, id, title.string.strip(), url)
 
-    mure = re.compile(r"/comic/_/comics/([^/]*)$")
-    def search(self, expr):
-        resp = urllib.urlopen(self.base + "forums/index.php?app=core&module=search&do=search&fromMainBar=1",
-                              urllib.urlencode({"search_term": expr, "search_app": "ccs:database:3"}))
-        try:
-            page = soup(resp.read())
-        finally:
-            resp.close()
-        none = page.find("p", attrs={"class": "no_messages"})
-        if none is not None and u"No results" in none.text:
-            return []
-        ret = []
-        for child in page.find("div", id="search_results").ol.childGenerator():
-            if isinstance(child, BeautifulSoup.Tag) and child.name == u"li":
-                info = child.find("div", attrs={"class": "result_info"})
-                url = info.h3.a["href"].encode("us-ascii")
-                m = self.mure.search(url)
+    def _search(self, pars):
+        p = 1
+        while True:
+            _pars = dict(pars)
+            _pars["p"] = str(p)
+            resp = urllib.request.urlopen(self.base + "search?" + urllib.parse.urlencode(_pars))
+            try:
+                page = soupify(resp.read())
+            finally:
+                resp.close()
+            rls = page.find("div", id="comic_search_results").table
+            if rls.tbody is not None:
+                rls = rls.tbody
+            hasmore = False
+            for child in rls.findAll("tr"):
+                if child.th is not None: continue
+                if child.get("id", "")[:11] == "comic_rowo_": continue
+                if child.get("id") == "show_more_row":
+                    hasmore = True
+                    continue
+                link = child.td.strong.a
+                url = link["href"]
+                m = self.rure.search(url)
                 if m is None:
                     raise Exception("Got weird manga URL: %r" % url)
                 id = m.group(1)
-                name = info.h3.a.string.strip()
-                ret.append(manga(self, id, name, url))
-        return ret
+                name = link.text.strip()
+                yield manga(self, id, name, url)
+            p += 1
+            if not hasmore:
+                break
+
+    rure = re.compile(r"/comic/_/([^/]*)$")
+    def search(self, expr):
+        return self._search({"name": expr, "name_cond": "c"})
+
+    def byname(self, prefix):
+        for res in self._search({"name": prefix, "name_cond": "s"}):
+            if res.name[:len(prefix)].lower() == prefix.lower():
+                yield res
+            else:
+                for aname in res.altnames():
+                    if aname[:len(prefix)].lower() == prefix.lower():
+                        yield manga(self, res.id, aname, res.url)
+                        break
+                else:
+                    if False:
+                        print("eliding " + res.name)
+                        print(res.altnames())
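
A minimal usage sketch of the ported interface (assuming the automanga
package is importable as "manga", and that a credentials file named "batoto"
exists under profile.confdir with "username"/"password" or "pass64" lines,
as parsed by credentials.fromfile() above; the manga class itself is defined
in an unchanged part of the file, and _search() runs anonymously):

    from manga import batoto

    lib = batoto.library()           # falls back to credentials.default()
    for m in lib.search("yotsuba"):  # name_cond "c" presumably means "contains"
        print(m.id, m.name)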