Fix decoding of some pages

nemunaire 2014-07-08 02:44:20 +02:00
parent 63f24c7b59
commit fa77a3b323
2 changed files with 5 additions and 5 deletions

@@ -52,7 +52,7 @@ def get_synos(word):
     page = web.getURLContent(url)
     if page is not None:
         synos = list()
-        for line in page.decode().split("\n"):
+        for line in page.split("\n"):
             if re.match("[ \t]*<tr[^>]*>.*</tr>[ \t]*</table>.*", line) is not None:
                 for elt in re.finditer(">&[^;]+;([^&]*)&[^;]+;<", line):
                     synos.append(elt.group(1))
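
This is the caller side of the fix: getURLContent now returns an already-decoded str (or None), so get_synos splits the page directly instead of calling .decode() on bytes first. A minimal sketch of that contract, with parse_synonyms as a hypothetical name standing in for the loop above:

import re

def parse_synonyms(page):
    # 'page' is an already-decoded str as returned by the patched
    # getURLContent (second file, below), or None when the fetch failed.
    synos = list()
    if page is not None:
        for line in page.split("\n"):
            # Same patterns as the module: the row that closes the table,
            # then the entity-delimited synonyms inside it.
            if re.match("[ \t]*<tr[^>]*>.*</tr>[ \t]*</table>.*", line) is not None:
                for elt in re.finditer(">&[^;]+;([^&]*)&[^;]+;<", line):
                    synos.append(elt.group(1))
    return synos

The second changed file, whose hunks follow, is where the decoding itself moves.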

@@ -95,8 +95,9 @@ def getURLContent(url, timeout=15):
         data = res.read(size)
 
         # Decode content
-        charset = res.getheader("Content-Type").split(";")
-        if len(charset) > 1:
+        charset = "utf-8"
+        lcharset = res.getheader("Content-Type").split(";")
+        if len(lcharset) > 1:
             for c in charset:
                 ch = c.split("=")
                 if ch[0].strip().lower() == "charset" and len(ch) > 1:
@@ -105,14 +106,13 @@ def getURLContent(url, timeout=15):
                             charset = cha[1]
                         else:
                             charset = cha[0]
-        data = data.decode(charset)
     except http.client.BadStatusLine:
         return None
     finally:
         conn.close()
 
     if res.status == http.client.OK or res.status == http.client.SEE_OTHER:
-        return data
+        return data.decode(charset)
     elif res.status == http.client.FOUND or res.status == http.client.MOVED_PERMANENTLY:
         return getURLContent(res.getheader("Location"), timeout)
     else:
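
What these hunks accomplish: charset previously stayed bound to the list returned by split(";") whenever the Content-Type header carried no parameters, so data.decode(charset) raised a TypeError; it now defaults to "utf-8". The decode is also deferred until after the status check, so redirect responses (FOUND, MOVED_PERMANENTLY) are followed without decoding a body. Note that, as committed, the inner loop still iterates charset (now the string "utf-8") rather than the new lcharset list, so the header scan never matches and every page decodes as UTF-8. A self-contained sketch of what the detection presumably intends, with parse_charset as a hypothetical name not taken from the module:

def parse_charset(content_type):
    # Default to utf-8, as the patched getURLContent does, then look for a
    # "charset=..." parameter among the header's ';'-separated fields.
    charset = "utf-8"
    if content_type is not None:
        for field in content_type.split(";"):
            ch = field.split("=")
            if ch[0].strip().lower() == "charset" and len(ch) > 1:
                # Mirror the module's handling of dotted values: keep the
                # part after the dot when one is present.
                cha = ch[1].split(".")
                charset = cha[1] if len(cha) > 1 else cha[0]
    return charset

# parse_charset("text/html; charset=ISO-8859-1") returns "ISO-8859-1";
# parse_charset("application/json") falls back to "utf-8".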