Fix decoding of some pages
This commit is contained in:
parent
63f24c7b59
commit
fa77a3b323
@@ -52,7 +52,7 @@ def get_synos(word):
|
|||||||
page = web.getURLContent(url)
|
page = web.getURLContent(url)
|
||||||
if page is not None:
|
if page is not None:
|
||||||
synos = list()
|
synos = list()
|
||||||
for line in page.decode().split("\n"):
|
for line in page.split("\n"):
|
||||||
if re.match("[ \t]*<tr[^>]*>.*</tr>[ \t]*</table>.*", line) is not None:
|
if re.match("[ \t]*<tr[^>]*>.*</tr>[ \t]*</table>.*", line) is not None:
|
||||||
for elt in re.finditer(">&[^;]+;([^&]*)&[^;]+;<", line):
|
for elt in re.finditer(">&[^;]+;([^&]*)&[^;]+;<", line):
|
||||||
synos.append(elt.group(1))
|
synos.append(elt.group(1))
|
||||||
|
@@ -95,8 +95,9 @@ def getURLContent(url, timeout=15):
|
|||||||
data = res.read(size)
|
data = res.read(size)
|
||||||
|
|
||||||
# Decode content
|
# Decode content
|
||||||
charset = res.getheader("Content-Type").split(";")
|
charset = "utf-8"
|
||||||
if len(charset) > 1:
|
lcharset = res.getheader("Content-Type").split(";")
|
||||||
|
if len(lcharset) > 1:
|
||||||
for c in charset:
|
for c in charset:
|
||||||
ch = c.split("=")
|
ch = c.split("=")
|
||||||
if ch[0].strip().lower() == "charset" and len(ch) > 1:
|
if ch[0].strip().lower() == "charset" and len(ch) > 1:
|
||||||
@@ -105,14 +106,13 @@ def getURLContent(url, timeout=15):
|
|||||||
charset = cha[1]
|
charset = cha[1]
|
||||||
else:
|
else:
|
||||||
charset = cha[0]
|
charset = cha[0]
|
||||||
data = data.decode(charset)
|
|
||||||
except http.client.BadStatusLine:
|
except http.client.BadStatusLine:
|
||||||
return None
|
return None
|
||||||
finally:
|
finally:
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
if res.status == http.client.OK or res.status == http.client.SEE_OTHER:
|
if res.status == http.client.OK or res.status == http.client.SEE_OTHER:
|
||||||
return data
|
return data.decode(charset)
|
||||||
elif res.status == http.client.FOUND or res.status == http.client.MOVED_PERMANENTLY:
|
elif res.status == http.client.FOUND or res.status == http.client.MOVED_PERMANENTLY:
|
||||||
return getURLContent(res.getheader("Location"), timeout)
|
return getURLContent(res.getheader("Location"), timeout)
|
||||||
else:
|
else:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user