WIP: Trying to get Batoto to work with new site.
[automanga.git] / manga / batoto.py
CommitLineData
81be6921
FT
1import urllib.request, urllib.parse, http.cookiejar, re, bs4, os
2from . import profile, lib, htcache
# Shorthand for the BeautifulSoup class used throughout this module.
soup = bs4.BeautifulSoup

def soupify(cont):
    """Parse *cont* (HTML bytes or text) into a BeautifulSoup tree.

    Uses the stdlib ``html.parser`` backend so no extra parser package
    is required.  (Was a lambda; PEP 8 (E731) prefers a named ``def``.)"""
    return soup(cont, "html.parser")
class pageerror(Exception):
    """Raised when a fetched page does not have the structure we expect.

    The parsed page is kept on the ``page`` attribute so callers can
    inspect what was actually received."""
    def __init__(self, message, page):
        super().__init__(message)
        # The offending BeautifulSoup document, for post-mortem inspection.
        self.page = page
def byclass(el, name, cl):
    """Return the first ``name`` tag under *el* carrying CSS class *cl*.

    Returns None when no such tag exists."""
    candidates = (tag for tag in el.findAll(name)
                  if isinstance(tag, bs4.Tag) and cl in tag.get("class", []))
    return next(candidates, None)
def nextel(el):
    """Return the next sibling of *el* that is a real tag, skipping
    text nodes and other non-tag siblings."""
    el = el.nextSibling
    while not isinstance(el, bs4.Tag):
        el = el.nextSibling
    return el
class page(lib.page):
    """A single page of a Batoto chapter.

    The reader page is fetched lazily the first time the image URL is
    needed, and the result is cached on the instance."""

    def __init__(self, chapter, stack, n, url):
        self.stack = stack
        self.chapter = chapter
        self.n = n
        self.id = str(n)
        self.name = "Page %s" % n
        self.url = url
        # Cached image URL; resolved on first call to iurl().
        self.ciurl = None

    def iurl(self):
        """Return the URL of this page's image, fetching it on first use."""
        if self.ciurl is None:
            page = soupify(htcache.fetch(self.url))
            img = nextel(page.find("div", id="full_image")).img
            self.ciurl = img["src"]
        return self.ciurl

    def open(self):
        """Open a standard image stream for this page."""
        return lib.stdimgstream(self.iurl())

    def __str__(self):
        return self.name

    def __repr__(self):
        # BUGFIX: was misspelled ``__repr`` (missing trailing underscores),
        # so it never acted as the actual repr; now consistent with the
        # chapter and manga classes.
        return "<batoto.page %r.%r.%r>" % (self.chapter.manga.name, self.chapter.name, self.name)
class chapter(lib.pagelist):
    """One chapter of a manga; its page list is scraped lazily and cached."""

    # Matches the "page N" text of the reader's page-select options.
    pnre = re.compile(r"page (\d+)")

    def __init__(self, manga, stack, id, name, url):
        self.stack = stack
        self.manga = manga
        self.id = id
        self.name = name
        self.url = url
        self.cpag = None

    def __getitem__(self, i):
        return self.pages()[i]

    def __len__(self):
        return len(self.pages())

    def pages(self):
        """Return (and cache) the page objects of this chapter."""
        if self.cpag is not None:
            return self.cpag
        doc = soupify(htcache.fetch(self.url))
        pags = []
        for opt in doc.find("select", id="page_select").findAll("option"):
            pn = int(self.pnre.match(opt.string).group(1))
            pags.append(page(self, self.stack + [(self, len(pags))], pn, opt["value"]))
        self.cpag = pags
        return self.cpag

    def __str__(self):
        return self.name

    def __repr__(self):
        return "<batoto.chapter %r.%r>" % (self.manga.name, self.name)
class manga(lib.manga):
    # One manga (comic) on Batoto.  The chapter list and the alternate
    # names are scraped lazily and cached on the instance.
    def __init__(self, lib, id, name, url):
        self.lib = lib
        self.sess = lib.sess
        self.id = id
        self.name = name
        self.url = url
        self.cch = None      # cached chapter list (see ch())
        self.stack = []
        self.cnames = None   # cached alternate names (see altnames())

    def __getitem__(self, i):
        return self.ch()[i]

    def __len__(self):
        return len(self.ch())

    @staticmethod
    def vfylogin(page):
        # Heuristic login check used by sess.lfetch(): a registration
        # notice means we are anonymous, and a missing chapter table
        # means the page did not render as expected.
        if page.find("div", id="register_notice"):
            return False
        if not byclass(page, "table", "chapters_list"):
            return False
        return True

    # Extracts the numeric chapter id out of a /read/_/<id>/... URL.
    cure = re.compile(r"/read/_/(\d+)/[^/]*")
    def ch(self):
        # Return (and cache) the chapter objects, oldest first.
        if self.cch is None:
            page = self.sess.lfetch(self.url, self.vfylogin)
            cls = byclass(page, "table", "chapters_list")
            if cls.tbody is not None:
                cls = cls.tbody
            # Only rows in the library's configured language are kept.
            scl = "lang_" + self.lib.lang
            cch = []
            for ch in cls.childGenerator():
                if isinstance(ch, bs4.Tag) and ch.name == "tr":
                    cll = ch.get("class", [])
                    if "row" in cll and scl in cll:
                        url = ch.td.a["href"]
                        m = self.cure.search(url)
                        if m is None: raise pageerror("Got weird chapter URL: %r" % url, page)
                        cid = m.group(1)
                        # Rebuild a canonical reader URL from the id.
                        url = self.lib.base + "read/_/" + cid
                        name = ch.td.a.text
                        cch.append((cid, name, url))
            # The site lists newest first; we want chronological order.
            cch.reverse()
            rch = []
            for n, (cid, name, url) in enumerate(cch):
                rch.append(chapter(self, [(self, n)], cid, name, url))
            self.cch = rch
        return self.cch

    def altnames(self):
        # Scrape the "Alt Names:" row of the manga's info table and
        # return (and cache) the list of alternate names.
        if self.cnames is None:
            page = soupify(self.sess.fetch(self.url))
            cnames = None
            for tbl in page.findAll("table", attrs={"class": "ipb_table"}):
                if tbl.tbody is not None: tbl = tbl.tbody
                for tr in tbl.findAll("tr"):
                    if "Alt Names:" in tr.td.text:
                        # The names live in <span>s in the next cell.
                        nls = nextel(tr.td)
                        if nls.name != "td" or nls.span is None:
                            raise pageerror("Weird altnames table in " + self.id, page)
                        cnames = [nm.text.strip() for nm in nls.findAll("span")]
                        break
                if cnames is not None:
                    break
            if cnames is None:
                raise pageerror("Could not find altnames for " + self.id, page)
            self.cnames = cnames
        return self.cnames

    def __str__(self):
        return self.name

    def __repr__(self):
        return "<batoto.manga %r>" % self.name
class credentials(object):
    """A username/password pair for logging in to Batoto."""

    def __init__(self, username, password):
        self.username = username
        self.password = password

    @classmethod
    def fromfile(cls, path):
        """Parse credentials from the profile file at *path*.

        Recognized keys are ``username``, ``password`` and ``pass64``
        (a base64-encoded password).  Raises ValueError when either
        part is missing."""
        username = password = None
        with open(path) as fp:
            for words in profile.splitlines(fp):
                key = words[0]
                if key == "username":
                    username = words[1]
                elif key == "password":
                    password = words[1]
                elif key == "pass64":
                    import binascii
                    password = binascii.a2b_base64(words[1]).decode("utf8")
        if username is None or password is None:
            raise ValueError("Incomplete profile: " + path)
        return cls(username, password)

    @classmethod
    def default(cls):
        """Load credentials from the default profile location, or None."""
        path = os.path.join(profile.confdir, "batoto")
        if os.path.exists(path):
            return cls.fromfile(path)
        return None
class session(object):
    """A cookie-aware HTTP session against Batoto.

    Logs in lazily with the given credentials on first demand; with no
    credentials the session stays anonymous."""

    def __init__(self, base, credentials):
        self.base = base
        self.creds = credentials
        self.jar = http.cookiejar.CookieJar()
        self.web = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(self.jar))
        self.loggedin = False

    # Extracts the current username from the "Welcome, NAME " link text.
    rlre = re.compile(r"Welcome, (.*) ")
    def dologin(self):
        """Log in as self.creds, signing out first if some other user is
        currently logged in.

        Raises pageerror when the login flow cannot be completed."""
        with self.web.open(self.base) as hs:
            page = soupify(hs.read())

        cur = page.find("a", id="user_link")
        if cur:
            m = self.rlre.search(cur.get_text())
            if not m or m.group(1) != self.creds.username:
                # A different user is logged in: locate the sign-out link
                # and log out before proceeding.
                outurl = None
                nav = page.find("div", id="user_navigation")
                if nav:
                    for li in nav.findAll("li"):
                        # Guard li.a.string: it is None for nested markup.
                        if li.a and li.a.string and "Sign Out" in li.a.string:
                            outurl = li.a["href"]
                if not outurl:
                    raise pageerror("Could not find logout URL", page)
                # BUGFIX: was ``self.wep.open`` (typo), which raised
                # AttributeError whenever this path was taken.
                with self.web.open(outurl) as hs:
                    hs.read()
                with self.web.open(self.base) as hs:
                    page = soupify(hs.read())
            else:
                # Already logged in as the requested user; nothing to do.
                return

        # Submit the login form, preserving all hidden fields.
        form = page.find("form", id="login")
        values = {}
        for el in form.findAll("input", type="hidden"):
            values[el["name"]] = el["value"]
        values["ips_username"] = self.creds.username
        values["ips_password"] = self.creds.password
        values["anonymous"] = "1"
        req = urllib.request.Request(form["action"], urllib.parse.urlencode(values).encode("ascii"))
        with self.web.open(req) as hs:
            page = soupify(hs.read())
        # Verify the "signed in" confirmation message appeared.
        for resp in page.findAll("p", attrs={"class": "message"}):
            if resp.strong and "You are now signed in" in resp.strong.string:
                break
        else:
            raise pageerror("Could not log in", page)

    def login(self):
        """Ensure dologin() has run once; a no-op without credentials."""
        if not self.loggedin:
            if self.creds:
                self.dologin()
            self.loggedin = True

    def open(self, url):
        """Open *url* through this session's cookie-carrying opener."""
        return self.web.open(url)

    def fetch(self, url):
        """Fetch *url* and return the raw response body."""
        with self.open(url) as hs:
            return hs.read()

    def lfetch(self, url, ck):
        """Fetch *url*; if ck(page) fails, log in and retry once.

        Raises pageerror if the page still fails verification after
        logging in."""
        page = soupify(self.fetch(url))
        if not ck(page):
            self.login()
            page = soupify(self.fetch(url))
            if not ck(page):
                raise pageerror("Could not verify login status despite having logged in", page)
        return page
class library(lib.library):
    """The Batoto comic library: lookup by id, full search, and
    prefix matching against primary and alternate names."""

    def __init__(self, *, creds=None):
        if creds is None:
            creds = credentials.default()
        self.base = "http://bato.to/"
        self.sess = session(self.base, creds)
        self.lang = "English"

    def byid(self, id):
        """Return the manga with the given id; raise KeyError if absent."""
        url = self.base + "comic/_/comics/" + id
        page = soupify(self.sess.fetch(url))
        title = page.find("h1", attrs={"class": "ipsType_pagetitle"})
        if title is None:
            raise KeyError(id)
        return manga(self, id, title.string.strip(), url)

    # Extracts the manga id from a /comic/_/<id> URL.
    rure = re.compile(r"/comic/_/([^/]*)$")

    def _search(self, pars):
        """Yield manga matching the search parameters, walking through
        all result pages via the "show more" marker row."""
        p = 1
        while True:
            _pars = dict(pars)
            _pars["p"] = str(p)
            # BUGFIX: urlencode() returns a str; the old code encoded it
            # to bytes and concatenating str + bytes raised TypeError.
            resp = urllib.request.urlopen(self.base + "search?" + urllib.parse.urlencode(_pars))
            try:
                page = soupify(resp.read())
            finally:
                resp.close()
            rls = page.find("div", id="comic_search_results").table
            if rls.tbody is not None:
                rls = rls.tbody
            hasmore = False
            for child in rls.findAll("tr"):
                if child.th is not None: continue          # header row
                if child.get("id", "")[:11] == "comic_rowo_": continue
                if child.get("id") == "show_more_row":
                    # Marker row meaning further result pages exist.
                    hasmore = True
                    continue
                link = child.td.strong.a
                url = link["href"]
                m = self.rure.search(url)
                if m is None: raise Exception("Got weird manga URL: %r" % url)
                id = m.group(1)
                name = link.text.strip()
                yield manga(self, id, name, url)
            p += 1
            if not hasmore:
                break

    def search(self, expr):
        """Search for manga whose name contains *expr*."""
        return self._search({"name": expr, "name_cond": "c"})

    def byname(self, prefix):
        """Yield manga whose name, or one of whose alternate names,
        starts with *prefix* (case-insensitively).  Alternate-name hits
        are yielded under that alternate name."""
        for res in self._search({"name": prefix, "name_cond": "s"}):
            if res.name[:len(prefix)].lower() == prefix.lower():
                yield res
            else:
                for aname in res.altnames():
                    if aname[:len(prefix)].lower() == prefix.lower():
                        yield manga(self, res.id, aname, res.url)
                        break
                # Results matching neither name are silently elided.
                # (Removed dead ``if False:`` debug prints that were here.)