From c0d3b1a2e6671763efcce90b089bc0f9100e8d2f Mon Sep 17 00:00:00 2001
From: Fredrik Tolf
Date: Fri, 23 May 2014 00:15:38 +0200
Subject: [PATCH] Decode HTML entities correctly.

---
 manga/batoto.py   | 13 +++++++------
 manga/mangafox.py | 13 +++++++------
 manga/mrnet.py    | 11 ++++++-----
 manga/rawsen.py   | 11 ++++++-----
 4 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/manga/batoto.py b/manga/batoto.py
index 8593e04..baa602f 100644
--- a/manga/batoto.py
+++ b/manga/batoto.py
@@ -1,6 +1,7 @@
 import urllib, re, BeautifulSoup
 import lib, htcache
 soup = BeautifulSoup.BeautifulSoup
+soupify = lambda cont: soup(cont, convertEntities=soup.HTML_ENTITIES)
 
 def byclass(el, name, cl):
     for ch in el.findAll(name):
@@ -28,7 +29,7 @@ class page(lib.page):
 
     def iurl(self):
         if self.ciurl is None:
-            page = soup(htcache.fetch(self.url))
+            page = soupify(htcache.fetch(self.url))
             img = nextel(page.find("div", id="full_image")).img
             self.ciurl = img["src"].encode("us-ascii")
         return self.ciurl
@@ -60,7 +61,7 @@ class chapter(lib.pagelist):
     pnre = re.compile(r"page (\d+)")
     def pages(self):
         if self.cpag is None:
-            pg = soup(htcache.fetch(self.url))
+            pg = soupify(htcache.fetch(self.url))
             cpag = []
             for opt in pg.find("select", id="page_select").findAll("option"):
                 url = opt["value"].encode("us-ascii")
@@ -94,7 +95,7 @@ class manga(lib.manga):
     cure = re.compile(r"/read/_/(\d+)/[^/]*")
     def ch(self):
         if self.cch is None:
-            page = soup(htcache.fetch(self.url))
+            page = soupify(htcache.fetch(self.url))
             cls = byclass(page, u"table", u"chapters_list")
             if cls.tbody is not None:
                 cls = cls.tbody
@@ -120,7 +121,7 @@ class manga(lib.manga):
 
     def altnames(self):
         if self.cnames is None:
-            page = soup(htcache.fetch(self.url))
+            page = soupify(htcache.fetch(self.url))
             cnames = None
             for tbl in page.findAll("table", attrs={"class": "ipb_table"}):
                 if tbl.tbody is not None: tbl = tbl.tbody
@@ -151,7 +152,7 @@ class library(lib.library):
 
     def byid(self, id):
         url = self.base + "comic/_/comics/" + id
-        page = soup(htcache.fetch(url))
+        page = soupify(htcache.fetch(url))
         title = page.find("h1", attrs={"class": "ipsType_pagetitle"})
         if title is None:
             raise KeyError(id)
@@ -164,7 +165,7 @@ class library(lib.library):
             _pars["p"] = str(p)
             resp = urllib.urlopen(self.base + "search?" + urllib.urlencode(_pars))
             try:
-                page = soup(resp.read())
+                page = soupify(resp.read())
             finally:
                 resp.close()
             rls = page.find("div", id="comic_search_results").table
diff --git a/manga/mangafox.py b/manga/mangafox.py
index 9831a81..ef84eb0 100644
--- a/manga/mangafox.py
+++ b/manga/mangafox.py
@@ -2,6 +2,7 @@ import urllib, re
 import BeautifulSoup, json
 import lib, htcache
 soup = BeautifulSoup.BeautifulSoup
+soupify = lambda cont: soup(cont, convertEntities=soup.HTML_ENTITIES)
 
 class page(lib.page):
     def __init__(self, chapter, stack, n, url):
@@ -17,7 +18,7 @@ class page(lib.page):
 
     def iurl(self):
         if self.ciurl is None:
-            page = soup(htcache.fetch(self.url))
+            page = soupify(htcache.fetch(self.url))
             self.ciurl = page.find("div", id="viewer").find("img", id="image")["src"]
         return self.ciurl
 
@@ -48,7 +49,7 @@ class chapter(lib.pagelist):
 
     def pages(self):
         if self.cpag is None:
-            pg = soup(htcache.fetch(self.url + "1.html"))
+            pg = soupify(htcache.fetch(self.url + "1.html"))
             l = pg.find("form", id="top_bar").find("div", attrs={"class": "l"})
             if len(l.contents) != 3:
                 raise Exception("parse error: weird page list for %r" % self)
@@ -109,7 +110,7 @@ class manga(lib.manga):
 
     def vols(self):
         if self.cvol is None:
-            page = soup(htcache.fetch(self.url))
+            page = soupify(htcache.fetch(self.url))
             vls = page.find("div", id="chapters").findAll("div", attrs={"class": "slide"})
             cvol = []
             for i, vn in enumerate(reversed(vls)):
@@ -155,7 +156,7 @@ class library(lib.library):
         self.base = "http://mangafox.me/"
 
     def alphapage(self, pno):
-        page = soup(htcache.fetch(self.base + ("directory/%i.htm?az" % pno)))
+        page = soupify(htcache.fetch(self.base + ("directory/%i.htm?az" % pno)))
         ls = page.find("div", id="mangalist").find("ul", attrs={"class": "list"}).findAll("li")
         ret = []
         ubase = self.base + "manga/"
@@ -169,7 +170,7 @@ class library(lib.library):
         return ret
 
     def alphapages(self):
-        page = soup(htcache.fetch(self.base + "directory/?az"))
+        page = soupify(htcache.fetch(self.base + "directory/?az"))
         ls = page.find("div", id="mangalist").find("div", id="nav").find("ul").findAll("li")
         return int(ls[-2].find("a").string)
 
@@ -217,7 +218,7 @@ class library(lib.library):
 
     def byid(self, id):
         url = self.base + ("manga/%s/" % id)
-        page = soup(htcache.fetch(url))
+        page = soupify(htcache.fetch(url))
         if page.find("div", id="title") is None:
             # Assume we got the search page
             raise KeyError(id)
diff --git a/manga/mrnet.py b/manga/mrnet.py
index 1439f09..0505296 100644
--- a/manga/mrnet.py
+++ b/manga/mrnet.py
@@ -1,6 +1,7 @@
 import BeautifulSoup, urlparse
 import lib, htcache
 soup = BeautifulSoup.BeautifulSoup
+soupify = lambda cont: soup(cont, convertEntities=soup.HTML_ENTITIES)
 
 class page(lib.page):
     def __init__(self, chapter, stack, n, url):
@@ -15,7 +16,7 @@ class page(lib.page):
 
     def iurl(self):
         if self.ciurl is None:
-            page = soup(htcache.fetch(self.url))
+            page = soupify(htcache.fetch(self.url))
             self.ciurl = page.find("div", id="imgholder").find("img", id="img")["src"].encode("us-ascii")
         return self.ciurl
 
@@ -45,7 +46,7 @@ class chapter(lib.pagelist):
 
     def pages(self):
         if self.cpag is None:
-            pg = soup(htcache.fetch(self.url))
+            pg = soupify(htcache.fetch(self.url))
             pag = []
             for opt in pg.find("div", id="selectpage").find("select", id="pageMenu").findAll("option"):
                 url = urlparse.urljoin(self.url, opt["value"].encode("us-ascii"))
@@ -77,7 +78,7 @@ class manga(lib.manga):
 
     def ch(self):
         if self.cch is None:
-            page = soup(htcache.fetch(self.url))
+            page = soupify(htcache.fetch(self.url))
             cls = page.find("div", id="chapterlist").find("table", id="listing")
             i = 0
             cch = []
@@ -108,14 +109,14 @@ class library(lib.library):
 
     def byid(self, id):
         url = self.base + id
-        page = soup(htcache.fetch(url))
+        page = soupify(htcache.fetch(url))
         if page.find("h2", attrs={"class": "aname"}) is None:
             raise KeyError(id)
         name = page.find("h2", attrs={"class": "aname"}).string
         return manga(self, id, name, url)
 
     def __iter__(self):
-        page = soup(htcache.fetch(self.base + "alphabetical"))
+        page = soupify(htcache.fetch(self.base + "alphabetical"))
         for sec in page.findAll("div", attrs={"class": "series_alpha"}):
             for li in sec.find("ul", attrs={"class": "series_alpha"}).findAll("li"):
                 url = li.a["href"].encode("us-ascii")
diff --git a/manga/rawsen.py b/manga/rawsen.py
index 214a4b8..a87d52e 100644
--- a/manga/rawsen.py
+++ b/manga/rawsen.py
@@ -1,6 +1,7 @@
 import BeautifulSoup, urlparse
 import lib, htcache
 soup = BeautifulSoup.BeautifulSoup
+soupify = lambda cont: soup(cont, convertEntities=soup.HTML_ENTITIES)
 
 class page(lib.page):
     def __init__(self, chapter, stack, n, url):
@@ -15,7 +16,7 @@ class page(lib.page):
 
     def iurl(self):
         if self.ciurl is None:
-            page = soup(htcache.fetch(self.url))
+            page = soupify(htcache.fetch(self.url))
             for tr in page.findAll("tr"):
                 img = tr.find("img", id="picture")
                 if img is not None:
@@ -53,7 +54,7 @@ class chapter(lib.pagelist):
             if self.url[-2:] != "/1":
                 raise Exception("parse error: unexpected first page url for %r" % self)
             base = self.url[:-1]
-            pg = soup(htcache.fetch(self.url))
+            pg = soupify(htcache.fetch(self.url))
             pag = []
             for opt in pg.find("div", attrs={"class": "pager"}).find("select", attrs={"name": "page"}).findAll("option"):
                 n = int(opt["value"])
@@ -85,7 +86,7 @@ class manga(lib.manga):
 
     def ch(self):
         if self.cch is None:
-            page = soup(htcache.fetch(self.url))
+            page = soupify(htcache.fetch(self.url))
             cls = None
             for div in page.findAll("div", attrs={"class": "post"}):
                 if div.h3 is not None and u"Chapter List" in div.h3.string:
@@ -117,7 +118,7 @@ class library(lib.library):
 
     def byid(self, id):
         url = urlparse.urljoin(self.base, id + "/")
-        page = soup(htcache.fetch(url))
+        page = soupify(htcache.fetch(url))
         name = None
         for div in page.findAll("div", attrs={"class": "post"}):
             if div.h2 is not None and div.h2.a is not None:
@@ -130,7 +131,7 @@ class library(lib.library):
         return manga(self, id, name, url)
 
     def __iter__(self):
-        page = soup(htcache.fetch(self.base + "Manga/"))
+        page = soupify(htcache.fetch(self.base + "Manga/"))
        for part in page.find("div", attrs={"class": "post"}).findAll("table"):
             for row in part.findAll("tr"):
                 link = row.findAll("td")[1].a
-- 
2.11.0