Decode HTML entities correctly.

author Fredrik Tolf <fredrik@dolda2000.com>
Thu, 22 May 2014 22:15:38 +0000 (00:15 +0200)

committer Fredrik Tolf <fredrik@dolda2000.com>
Thu, 22 May 2014 22:15:38 +0000 (00:15 +0200)
diff --git a/manga/batoto.py b/manga/batoto.py

index 8593e04..baa602f 100644 (file)
--- a/manga/batoto.py
+++ b/manga/batoto.py
@@ -1,6 +1,7 @@
  import urllib, re, BeautifulSoup
  import lib, htcache
  soup = BeautifulSoup.BeautifulSoup
  import urllib, re, BeautifulSoup
  import lib, htcache
  soup = BeautifulSoup.BeautifulSoup
+soupify = lambda cont: soup(cont, convertEntities=soup.HTML_ENTITIES)
  
  def byclass(el, name, cl):
      for ch in el.findAll(name):
  
  def byclass(el, name, cl):
      for ch in el.findAll(name):
@@ -28,7 +29,7 @@ class page(lib.page):
  
      def iurl(self):
          if self.ciurl is None:
  
      def iurl(self):
          if self.ciurl is None:
-            page = soup(htcache.fetch(self.url))
+            page = soupify(htcache.fetch(self.url))
              img = nextel(page.find("div", id="full_image")).img
              self.ciurl = img["src"].encode("us-ascii")
          return self.ciurl
              img = nextel(page.find("div", id="full_image")).img
              self.ciurl = img["src"].encode("us-ascii")
          return self.ciurl
@@ -60,7 +61,7 @@ class chapter(lib.pagelist):
      pnre = re.compile(r"page (\d+)")
      def pages(self):
          if self.cpag is None:
      pnre = re.compile(r"page (\d+)")
      def pages(self):
          if self.cpag is None:
-            pg = soup(htcache.fetch(self.url))
+            pg = soupify(htcache.fetch(self.url))
              cpag = []
              for opt in pg.find("select", id="page_select").findAll("option"):
                  url = opt["value"].encode("us-ascii")
              cpag = []
              for opt in pg.find("select", id="page_select").findAll("option"):
                  url = opt["value"].encode("us-ascii")
@@ -94,7 +95,7 @@ class manga(lib.manga):
      cure = re.compile(r"/read/_/(\d+)/[^/]*")
      def ch(self):
          if self.cch is None:
      cure = re.compile(r"/read/_/(\d+)/[^/]*")
      def ch(self):
          if self.cch is None:
-            page = soup(htcache.fetch(self.url))
+            page = soupify(htcache.fetch(self.url))
              cls = byclass(page, u"table", u"chapters_list")
              if cls.tbody is not None:
                  cls = cls.tbody
              cls = byclass(page, u"table", u"chapters_list")
              if cls.tbody is not None:
                  cls = cls.tbody
@@ -120,7 +121,7 @@ class manga(lib.manga):
  
      def altnames(self):
          if self.cnames is None:
  
      def altnames(self):
          if self.cnames is None:
-            page = soup(htcache.fetch(self.url))
+            page = soupify(htcache.fetch(self.url))
              cnames = None
              for tbl in page.findAll("table", attrs={"class": "ipb_table"}):
                  if tbl.tbody is not None: tbl = tbl.tbody
              cnames = None
              for tbl in page.findAll("table", attrs={"class": "ipb_table"}):
                  if tbl.tbody is not None: tbl = tbl.tbody
@@ -151,7 +152,7 @@ class library(lib.library):
  
      def byid(self, id):
          url = self.base + "comic/_/comics/" + id
  
      def byid(self, id):
          url = self.base + "comic/_/comics/" + id
-        page = soup(htcache.fetch(url))
+        page = soupify(htcache.fetch(url))
          title = page.find("h1", attrs={"class": "ipsType_pagetitle"})
          if title is None:
              raise KeyError(id)
          title = page.find("h1", attrs={"class": "ipsType_pagetitle"})
          if title is None:
              raise KeyError(id)
@@ -164,7 +165,7 @@ class library(lib.library):
              _pars["p"] = str(p)
              resp = urllib.urlopen(self.base + "search?" + urllib.urlencode(_pars))
              try:
              _pars["p"] = str(p)
              resp = urllib.urlopen(self.base + "search?" + urllib.urlencode(_pars))
              try:
-                page = soup(resp.read())
+                page = soupify(resp.read())
              finally:
                  resp.close()
              rls = page.find("div", id="comic_search_results").table
              finally:
                  resp.close()
              rls = page.find("div", id="comic_search_results").table
diff --git a/manga/mangafox.py b/manga/mangafox.py

index 9831a81..ef84eb0 100644 (file)
--- a/manga/mangafox.py
+++ b/manga/mangafox.py
@@ -2,6 +2,7 @@ import urllib, re
  import BeautifulSoup, json
  import lib, htcache
  soup = BeautifulSoup.BeautifulSoup
  import BeautifulSoup, json
  import lib, htcache
  soup = BeautifulSoup.BeautifulSoup
+soupify = lambda cont: soup(cont, convertEntities=soup.HTML_ENTITIES)
  
  class page(lib.page):
      def __init__(self, chapter, stack, n, url):
  
  class page(lib.page):
      def __init__(self, chapter, stack, n, url):
@@ -17,7 +18,7 @@ class page(lib.page):
  
      def iurl(self):
          if self.ciurl is None:
  
      def iurl(self):
          if self.ciurl is None:
-            page = soup(htcache.fetch(self.url))
+            page = soupify(htcache.fetch(self.url))
              self.ciurl = page.find("div", id="viewer").find("img", id="image")["src"]
          return self.ciurl
  
              self.ciurl = page.find("div", id="viewer").find("img", id="image")["src"]
          return self.ciurl
  
@@ -48,7 +49,7 @@ class chapter(lib.pagelist):
  
      def pages(self):
          if self.cpag is None:
  
      def pages(self):
          if self.cpag is None:
-            pg = soup(htcache.fetch(self.url + "1.html"))
+            pg = soupify(htcache.fetch(self.url + "1.html"))
              l = pg.find("form", id="top_bar").find("div", attrs={"class": "l"})
              if len(l.contents) != 3:
                  raise Exception("parse error: weird page list for %r" % self)
              l = pg.find("form", id="top_bar").find("div", attrs={"class": "l"})
              if len(l.contents) != 3:
                  raise Exception("parse error: weird page list for %r" % self)
@@ -109,7 +110,7 @@ class manga(lib.manga):
  
      def vols(self):
          if self.cvol is None:
  
      def vols(self):
          if self.cvol is None:
-            page = soup(htcache.fetch(self.url))
+            page = soupify(htcache.fetch(self.url))
              vls = page.find("div", id="chapters").findAll("div", attrs={"class": "slide"})
              cvol = []
              for i, vn in enumerate(reversed(vls)):
              vls = page.find("div", id="chapters").findAll("div", attrs={"class": "slide"})
              cvol = []
              for i, vn in enumerate(reversed(vls)):
@@ -155,7 +156,7 @@ class library(lib.library):
          self.base = "http://mangafox.me/"
  
      def alphapage(self, pno):
          self.base = "http://mangafox.me/"
  
      def alphapage(self, pno):
-        page = soup(htcache.fetch(self.base + ("directory/%i.htm?az" % pno)))
+        page = soupify(htcache.fetch(self.base + ("directory/%i.htm?az" % pno)))
          ls = page.find("div", id="mangalist").find("ul", attrs={"class": "list"}).findAll("li")
          ret = []
          ubase = self.base + "manga/"
          ls = page.find("div", id="mangalist").find("ul", attrs={"class": "list"}).findAll("li")
          ret = []
          ubase = self.base + "manga/"
@@ -169,7 +170,7 @@ class library(lib.library):
          return ret
  
      def alphapages(self):
          return ret
  
      def alphapages(self):
-        page = soup(htcache.fetch(self.base + "directory/?az"))
+        page = soupify(htcache.fetch(self.base + "directory/?az"))
          ls = page.find("div", id="mangalist").find("div", id="nav").find("ul").findAll("li")
          return int(ls[-2].find("a").string)
  
          ls = page.find("div", id="mangalist").find("div", id="nav").find("ul").findAll("li")
          return int(ls[-2].find("a").string)
  
@@ -217,7 +218,7 @@ class library(lib.library):
  
      def byid(self, id):
          url = self.base + ("manga/%s/" % id)
  
      def byid(self, id):
          url = self.base + ("manga/%s/" % id)
-        page = soup(htcache.fetch(url))
+        page = soupify(htcache.fetch(url))
          if page.find("div", id="title") is None:
              # Assume we got the search page
              raise KeyError(id)
          if page.find("div", id="title") is None:
              # Assume we got the search page
              raise KeyError(id)
diff --git a/manga/mrnet.py b/manga/mrnet.py

index 1439f09..0505296 100644 (file)
--- a/manga/mrnet.py
+++ b/manga/mrnet.py
@@ -1,6 +1,7 @@
  import BeautifulSoup, urlparse
  import lib, htcache
  soup = BeautifulSoup.BeautifulSoup
  import BeautifulSoup, urlparse
  import lib, htcache
  soup = BeautifulSoup.BeautifulSoup
+soupify = lambda cont: soup(cont, convertEntities=soup.HTML_ENTITIES)
  
  class page(lib.page):
      def __init__(self, chapter, stack, n, url):
  
  class page(lib.page):
      def __init__(self, chapter, stack, n, url):
@@ -15,7 +16,7 @@ class page(lib.page):
  
      def iurl(self):
          if self.ciurl is None:
  
      def iurl(self):
          if self.ciurl is None:
-            page = soup(htcache.fetch(self.url))
+            page = soupify(htcache.fetch(self.url))
              self.ciurl = page.find("div", id="imgholder").find("img", id="img")["src"].encode("us-ascii")
          return self.ciurl
  
              self.ciurl = page.find("div", id="imgholder").find("img", id="img")["src"].encode("us-ascii")
          return self.ciurl
  
@@ -45,7 +46,7 @@ class chapter(lib.pagelist):
  
      def pages(self):
          if self.cpag is None:
  
      def pages(self):
          if self.cpag is None:
-            pg = soup(htcache.fetch(self.url))
+            pg = soupify(htcache.fetch(self.url))
              pag = []
              for opt in pg.find("div", id="selectpage").find("select", id="pageMenu").findAll("option"):
                  url = urlparse.urljoin(self.url, opt["value"].encode("us-ascii"))
              pag = []
              for opt in pg.find("div", id="selectpage").find("select", id="pageMenu").findAll("option"):
                  url = urlparse.urljoin(self.url, opt["value"].encode("us-ascii"))
@@ -77,7 +78,7 @@ class manga(lib.manga):
  
      def ch(self):
          if self.cch is None:
  
      def ch(self):
          if self.cch is None:
-            page = soup(htcache.fetch(self.url))
+            page = soupify(htcache.fetch(self.url))
              cls = page.find("div", id="chapterlist").find("table", id="listing")
              i = 0
              cch = []
              cls = page.find("div", id="chapterlist").find("table", id="listing")
              i = 0
              cch = []
@@ -108,14 +109,14 @@ class library(lib.library):
  
      def byid(self, id):
          url = self.base + id
  
      def byid(self, id):
          url = self.base + id
-        page = soup(htcache.fetch(url))
+        page = soupify(htcache.fetch(url))
          if page.find("h2", attrs={"class": "aname"}) is None:
              raise KeyError(id)
          name = page.find("h2", attrs={"class": "aname"}).string
          return manga(self, id, name, url)
  
      def __iter__(self):
          if page.find("h2", attrs={"class": "aname"}) is None:
              raise KeyError(id)
          name = page.find("h2", attrs={"class": "aname"}).string
          return manga(self, id, name, url)
  
      def __iter__(self):
-        page = soup(htcache.fetch(self.base + "alphabetical"))
+        page = soupify(htcache.fetch(self.base + "alphabetical"))
          for sec in page.findAll("div", attrs={"class": "series_alpha"}):
              for li in sec.find("ul", attrs={"class": "series_alpha"}).findAll("li"):
                  url = li.a["href"].encode("us-ascii")
          for sec in page.findAll("div", attrs={"class": "series_alpha"}):
              for li in sec.find("ul", attrs={"class": "series_alpha"}).findAll("li"):
                  url = li.a["href"].encode("us-ascii")
diff --git a/manga/rawsen.py b/manga/rawsen.py

index 214a4b8..a87d52e 100644 (file)
--- a/manga/rawsen.py
+++ b/manga/rawsen.py
@@ -1,6 +1,7 @@
  import BeautifulSoup, urlparse
  import lib, htcache
  soup = BeautifulSoup.BeautifulSoup
  import BeautifulSoup, urlparse
  import lib, htcache
  soup = BeautifulSoup.BeautifulSoup
+soupify = lambda cont: soup(cont, convertEntities=soup.HTML_ENTITIES)
  
  class page(lib.page):
      def __init__(self, chapter, stack, n, url):
  
  class page(lib.page):
      def __init__(self, chapter, stack, n, url):
@@ -15,7 +16,7 @@ class page(lib.page):
  
      def iurl(self):
          if self.ciurl is None:
  
      def iurl(self):
          if self.ciurl is None:
-            page = soup(htcache.fetch(self.url))
+            page = soupify(htcache.fetch(self.url))
              for tr in page.findAll("tr"):
                  img = tr.find("img", id="picture")
                  if img is not None:
              for tr in page.findAll("tr"):
                  img = tr.find("img", id="picture")
                  if img is not None:
@@ -53,7 +54,7 @@ class chapter(lib.pagelist):
              if self.url[-2:] != "/1":
                  raise Exception("parse error: unexpected first page url for %r" % self)
              base = self.url[:-1]
              if self.url[-2:] != "/1":
                  raise Exception("parse error: unexpected first page url for %r" % self)
              base = self.url[:-1]
-            pg = soup(htcache.fetch(self.url))
+            pg = soupify(htcache.fetch(self.url))
              pag = []
              for opt in pg.find("div", attrs={"class": "pager"}).find("select", attrs={"name": "page"}).findAll("option"):
                  n = int(opt["value"])
              pag = []
              for opt in pg.find("div", attrs={"class": "pager"}).find("select", attrs={"name": "page"}).findAll("option"):
                  n = int(opt["value"])
@@ -85,7 +86,7 @@ class manga(lib.manga):
  
      def ch(self):
          if self.cch is None:
  
      def ch(self):
          if self.cch is None:
-            page = soup(htcache.fetch(self.url))
+            page = soupify(htcache.fetch(self.url))
              cls = None
              for div in page.findAll("div", attrs={"class": "post"}):
                  if div.h3 is not None and u"Chapter List" in div.h3.string:
              cls = None
              for div in page.findAll("div", attrs={"class": "post"}):
                  if div.h3 is not None and u"Chapter List" in div.h3.string:
@@ -117,7 +118,7 @@ class library(lib.library):
  
      def byid(self, id):
          url = urlparse.urljoin(self.base, id + "/")
  
      def byid(self, id):
          url = urlparse.urljoin(self.base, id + "/")
-        page = soup(htcache.fetch(url))
+        page = soupify(htcache.fetch(url))
          name = None
          for div in page.findAll("div", attrs={"class": "post"}):
              if div.h2 is not None and div.h2.a is not None:
          name = None
          for div in page.findAll("div", attrs={"class": "post"}):
              if div.h2 is not None and div.h2.a is not None:
@@ -130,7 +131,7 @@ class library(lib.library):
          return manga(self, id, name, url)
  
      def __iter__(self):
          return manga(self, id, name, url)
  
      def __iter__(self):
-        page = soup(htcache.fetch(self.base + "Manga/"))
+        page = soupify(htcache.fetch(self.base + "Manga/"))
          for part in page.find("div", attrs={"class": "post"}).findAll("table"):
              for row in part.findAll("tr"):
                  link = row.findAll("td")[1].a
          for part in page.find("div", attrs={"class": "post"}).findAll("table"):
              for row in part.findAll("tr"):
                  link = row.findAll("td")[1].a
author	Fredrik Tolf <fredrik@dolda2000.com>
	Thu, 22 May 2014 22:15:38 +0000 (00:15 +0200)
committer	Fredrik Tolf <fredrik@dolda2000.com>
	Thu, 22 May 2014 22:15:38 +0000 (00:15 +0200)
manga/batoto.py		patch | blob | blame | history
manga/mangafox.py		patch | blob | blame | history
manga/mrnet.py		patch | blob | blame | history
manga/rawsen.py		patch | blob | blame | history