Fix decoding of some pages
This commit is contained in:
parent
63f24c7b59
commit
fa77a3b323
@ -52,7 +52,7 @@ def get_synos(word):
|
||||
page = web.getURLContent(url)
|
||||
if page is not None:
|
||||
synos = list()
|
||||
for line in page.decode().split("\n"):
|
||||
for line in page.split("\n"):
|
||||
if re.match("[ \t]*<tr[^>]*>.*</tr>[ \t]*</table>.*", line) is not None:
|
||||
for elt in re.finditer(">&[^;]+;([^&]*)&[^;]+;<", line):
|
||||
synos.append(elt.group(1))
|
||||
|
@ -95,8 +95,9 @@ def getURLContent(url, timeout=15):
|
||||
data = res.read(size)
|
||||
|
||||
# Decode content
|
||||
charset = res.getheader("Content-Type").split(";")
|
||||
if len(charset) > 1:
|
||||
charset = "utf-8"
|
||||
lcharset = res.getheader("Content-Type").split(";")
|
||||
if len(lcharset) > 1:
|
||||
for c in charset:
|
||||
ch = c.split("=")
|
||||
if ch[0].strip().lower() == "charset" and len(ch) > 1:
|
||||
@ -105,14 +106,13 @@ def getURLContent(url, timeout=15):
|
||||
charset = cha[1]
|
||||
else:
|
||||
charset = cha[0]
|
||||
data = data.decode(charset)
|
||||
except http.client.BadStatusLine:
|
||||
return None
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
if res.status == http.client.OK or res.status == http.client.SEE_OTHER:
|
||||
return data
|
||||
return data.decode(charset)
|
||||
elif res.status == http.client.FOUND or res.status == http.client.MOVED_PERMANENTLY:
|
||||
return getURLContent(res.getheader("Location"), timeout)
|
||||
else:
|
||||
|
Loading…
x
Reference in New Issue
Block a user