From: Fredrik Tolf Date: Wed, 29 Feb 2012 21:26:17 +0000 (+0100) Subject: Added more persistent string IDs to the various pagetree nodes. X-Git-Url: http://dolda2000.com/gitweb/?p=automanga.git;a=commitdiff_plain;h=46b3b90eef4007f3f4e871afd4854f7a06c8bfc8 Added more persistent string IDs to the various pagetree nodes. --- diff --git a/manga/lib.py b/manga/lib.py index d976993..52f75ea 100644 --- a/manga/lib.py +++ b/manga/lib.py @@ -9,6 +9,13 @@ class library(object): All libraries should implement this.""" raise NotImplementedError() + def byid(self, id): + """Returns a previously known manga by its string ID, or + raises KeyError if no such manga could be found. + + All libraries should implement this.""" + raise KeyError(id) + def __iter__(self): """Return an iterator of all known mangas in this library. @@ -18,13 +25,30 @@ class library(object): class pagetree(object): """Base class for objects in the tree of pages and pagelists. - All pagetree objects should contain an attribute `stack', contains - a list of pairs. The last pair in the list should be the pagetree - object which yielded this pagetree object, along with the index - which yielded it. Every non-last pair should be the same + All pagetree objects should contain an attribute `stack', + containing a list of pairs. The last pair in the list should be + the pagetree object which yielded this pagetree object, along with + the index which yielded it. Every non-last pair should be the same information for the pair following it. The only objects with empty - `stack' lists should be `manga' objects.""" - pass + `stack' lists should be `manga' objects. + + All non-root pagetree objects should also contain an attribute + `id', which should be a string that can be passed to the `byid' + function of its parent node to recover the node. Such string ID + should be more persistent than the node's numeric index in the + parent.""" + + def idlist(self): + """Returns a list of the IDs necessary to resolve this node + from the root node.""" + if len(self.stack) == 0: + raise Exception("Cannot get ID list on root node.") + return [n.id for n, i in self.stack[1:]] + [self.id] + + def byidlist(self, idlist): + if len(idlist) == 0: + return self + return self.byid(idlist[0]).byidlist(idlist[1:]) class pagelist(pagetree): """Class representing a list of either pages, or nested @@ -49,9 +73,25 @@ class pagelist(pagetree): All pagelists need to implement this.""" raise NotImplementedError() + def byid(self, id): + """Return the direct sub-node of this pagelist which has the + given string ID. If none is found, a KeyError is raised. + + This default method iterates the children of this node, but + may be overridden by some more efficient implementation. + """ + for ch in self: + if ch.id == id: + return ch + raise KeyError(id) + class manga(pagelist): """Class reprenting a single manga. Includes the pagelist class, - and all constraints valid for it.""" + and all constraints valid for it. + + A manga is a root pagetree node, but should also contain an `id' + attribute, which can be used to recover the manga from its + library's `byid' function.""" pass class page(pagetree): diff --git a/manga/mangafox.py b/manga/mangafox.py index cb28944..8c23630 100644 --- a/manga/mangafox.py +++ b/manga/mangafox.py @@ -24,6 +24,7 @@ class page(lib.page): self.volume = self.chapter.volume self.manga = self.volume.manga self.n = n + self.id = str(n) self.url = url self.ciurl = None @@ -37,10 +38,11 @@ class page(lib.page): return imgstream(self.iurl()) class chapter(lib.pagelist): - def __init__(self, volume, stack, name, url): + def __init__(self, volume, stack, id, name, url): self.stack = stack self.volume = volume self.manga = volume.manga + self.id = id self.name = name self.url = url self.cpag = None @@ -70,9 +72,10 @@ class chapter(lib.pagelist): return "" % (self.manga.name, self.volume.name, self.name) class volume(lib.pagelist): - def __init__(self, manga, stack, name): + def __init__(self, manga, stack, id, name): self.stack = stack self.manga = manga + self.id = id self.name = name self.ch = [] @@ -95,8 +98,9 @@ def nextel(el): return el class manga(lib.manga): - def __init__(self, lib, name, url): + def __init__(self, lib, id, name, url): self.lib = lib + self.id = id self.name = name self.url = url self.cvol = None @@ -114,13 +118,16 @@ class manga(lib.manga): vls = page.find("div", id="chapters").findAll("div", attrs={"class": "slide"}) self.cvol = [] for i, vn in enumerate(reversed(vls)): - vol = volume(self, [(self, i)], vn.find("h3", attrs={"class": "volume"}).contents[0].strip()) + name = vn.find("h3", attrs={"class": "volume"}).contents[0].strip() + vid = name.encode("utf8") + vol = volume(self, [(self, i)], vid, name) cls = nextel(vn) if cls.name != u"ul" or cls["class"] != u"chlist": raise Exception("parse error: weird volume list for %r" % self) for o, ch in enumerate(reversed(cls.findAll("li"))): n = ch.div.h3 or ch.div.h4 name = n.a.string + chid = name.encode("utf8") for span in ch("span"): try: if u" title " in (u" " + span["class"] + u" "): @@ -130,7 +137,7 @@ class manga(lib.manga): url = n.a["href"].encode("us-ascii") if url[-7:] != "/1.html": raise Exception("parse error: unexpected chapter URL for %r: %s" % (self, url)) - vol.ch.append(chapter(vol, vol.stack + [(vol, o)], name, url[:-6])) + vol.ch.append(chapter(vol, vol.stack + [(vol, o)], chid, name, url[:-6])) self.cvol.append(vol) return self.cvol @@ -151,11 +158,14 @@ class library(lib.library): page = soup(htcache.fetch(self.base + ("directory/%i.htm?az" % pno))) ls = page.find("div", id="mangalist").find("ul", attrs={"class": "list"}).findAll("li") ret = [] + ubase = self.base + "manga/" for m in ls: t = m.find("div", attrs={"class": "manga_text"}).find("a", attrs={"class": "title"}) name = t.string url = t["href"].encode("us-ascii") - ret.append(manga(self, name, url)) + if url[:len(ubase)] != ubase or url.find('/', len(ubase)) != (len(url) - 1): + raise Exception("parse error: unexpected manga URL for %r: %s" % (name, url)) + ret.append(manga(self, url[len(ubase):-1], name, url)) return ret def alphapages(self): @@ -197,5 +207,14 @@ class library(lib.library): ls = self.alphapage(pno) i = 0 + def byid(self, id): + url = self.base + ("manga/%s/" % id) + page = soup(htcache.fetch(url)) + if page.find("div", id="title") is None: + # Assume we got the search page + raise KeyError(id) + name = page.find("div", id="series_info").find("div", attrs={"class": "cover"}).img["alt"] + return manga(self, id, name, url) + def __iter__(self): raise NotImplementedError("mangafox iterator")