# Decode HTML entities correctly.
# [automanga.git] / manga / rawsen.py
import BeautifulSoup, urlparse
import lib, htcache

# Shorthand for the BeautifulSoup parser class.
soup = BeautifulSoup.BeautifulSoup

def soupify(cont):
    """Parse *cont* into a BeautifulSoup tree, decoding HTML entities.

    Was a lambda assignment before; PEP 8 (E731) prefers a def, which
    also gives the callable a proper name in tracebacks.
    """
    return soup(cont, convertEntities=soup.HTML_ENTITIES)
50f7a215
FT
5
class page(lib.page):
    """A single page of a chapter on raw.senmanga.com.

    The image URL is resolved lazily from the page HTML on first access
    and cached in ``ciurl``.
    """
    def __init__(self, chapter, stack, n, url):
        self.stack = stack
        self.chapter = chapter
        self.manga = chapter.manga
        self.n = n
        self.id = str(n)
        self.name = u"Page " + unicode(n)
        self.url = url
        self.ciurl = None

    def iurl(self):
        """Return the image URL for this page, fetching and caching it on first use."""
        if self.ciurl is None:
            doc = soupify(htcache.fetch(self.url))
            found = None
            for row in doc.findAll("tr"):
                pic = row.find("img", id="picture")
                if pic is not None:
                    # Keep scanning the whole table; the last match wins.
                    found = urlparse.urljoin(self.url, pic["src"].encode("us-ascii"))
            if found is None:
                raise Exception("parse error: could not find image url for %r" % self)
            self.ciurl = found
        return self.ciurl

    def open(self):
        """Open a standard image stream over the resolved image URL."""
        return lib.stdimgstream(self.iurl())

    def __str__(self):
        return self.name

    def __repr__(self):
        return "<rawsen.page %r.%r.%r>" % (self.manga.name, self.chapter.name, self.name)
36
class chapter(lib.pagelist):
    """One chapter of a manga, lazily building its list of pages."""

    def __init__(self, manga, stack, id, name, url):
        self.stack = stack
        self.manga = manga
        self.id = id
        self.name = name
        self.url = url
        self.cpag = None

    def __getitem__(self, i):
        return self.pages()[i]

    def __len__(self):
        return len(self.pages())

    def pages(self):
        """Fetch, parse and cache the page list for this chapter.

        The chapter URL must point at page 1 ("…/1"); sibling page URLs
        are derived by substituting the page number.
        """
        if self.cpag is None:
            if not self.url.endswith("/1"):
                raise Exception("parse error: unexpected first page url for %r" % self)
            base = self.url[:-1]
            doc = soupify(htcache.fetch(self.url))
            pager = doc.find("div", attrs={"class": "pager"})
            sel = pager.find("select", attrs={"name": "page"})
            pag = []
            for opt in sel.findAll("option"):
                n = int(opt["value"])
                purl = urlparse.urljoin(base, str(n))
                pag.append(page(self, self.stack + [(self, len(pag))], n, purl))
            self.cpag = pag
        return self.cpag

    def __str__(self):
        return self.name

    def __repr__(self):
        return "<rawsen.chapter %r.%r>" % (self.manga.name, self.name)
71
class manga(lib.manga):
    """A manga on raw.senmanga.com, lazily building its chapter list."""

    def __init__(self, lib, id, name, url):
        self.lib = lib
        self.id = id
        self.name = name
        self.url = url
        self.cch = None
        self.stack = []

    def __getitem__(self, i):
        return self.ch()[i]

    def __len__(self):
        return len(self.ch())

    def ch(self):
        """Fetch, parse and cache the chapter list for this manga."""
        if self.cch is None:
            doc = soupify(htcache.fetch(self.url))
            # Locate the first "post" div whose heading mentions the chapter list.
            clist = None
            for div in doc.findAll("div", attrs={"class": "post"}):
                if div.h3 is not None and u"Chapter List" in div.h3.string:
                    clist = div
                    break
            if clist is None:
                raise Exception("parse error: no chapter list found for %r" % self)
            ret = []
            for row in clist.table.findAll("tr"):
                cell = row.findAll("td")[1]
                if cell.a is None:
                    continue
                name = cell.a["title"]
                curl = cell.a["href"].encode("us-ascii")
                ret.append(chapter(self, [(self, len(ret))], name.encode("utf-8"), name, curl))
            self.cch = ret
        return self.cch

    def __str__(self):
        return self.name

    def __repr__(self):
        return "<rawsen.manga %r>" % self.name
114
class library(lib.library):
    """The raw.senmanga.com backend library."""

    def __init__(self):
        self.base = "http://raw.senmanga.com/"

    def byid(self, id):
        """Look up a manga by its URL id; raise KeyError if not found."""
        url = urlparse.urljoin(self.base, id + "/")
        doc = soupify(htcache.fetch(url))
        name = None
        for div in doc.findAll("div", attrs={"class": "post"}):
            if div.h2 is None or div.h2.a is None:
                continue
            curl = div.h2.a["href"].encode("us-ascii")
            # Accept only links shaped like ".../<id>/"; last match wins.
            if curl[-1] != '/':
                continue
            slash = curl.rfind('/', 0, -1)
            if slash < 0:
                continue
            if curl[slash + 1:-1] != id:
                continue
            name = div.h2.a.string
        if name is None:
            raise KeyError(id)
        return manga(self, id, name, url)

    def __iter__(self):
        """Yield every manga in the site's directory listing."""
        doc = soupify(htcache.fetch(self.base + "Manga/"))
        for tbl in doc.find("div", attrs={"class": "post"}).findAll("table"):
            for row in tbl.findAll("tr"):
                link = row.findAll("td")[1].a
                if link is None:
                    continue
                url = link["href"].encode("us-ascii")
                # Only accept directory links of the form "/<id>/".
                if len(url) < 3 or url[:1] != '/' or url[-1:] != '/':
                    continue
                yield manga(self, url[1:-1], link.string, urlparse.urljoin(self.base, url))

    def byname(self, prefix):
        """Yield all mangas whose name starts with *prefix* (case-insensitive)."""
        if not isinstance(prefix, unicode):
            prefix = prefix.decode("utf8")
        prefix = prefix.lower()
        for m in self:
            if m.name.lower().startswith(prefix):
                yield m

    def search(self, expr):
        """Yield all mangas whose name contains *expr* (case-insensitive)."""
        if not isinstance(expr, unicode):
            expr = expr.decode("utf8")
        expr = expr.lower()
        for m in self:
            if expr in m.name.lower():
                yield m