2023-03-08
4:33 PM


Firstly test liburl:

Good. Nothing changed. Then apply the table.

5:09 PM

It took some time trying to write the output to a file with a Chinese filename; I gave up eventually. It even raised an exception when I printed the (Chinese) name to the console (a decoding problem).

Then I'd write a reporter.

6:41 PM

# -*- coding: utf-8 -*-

import json
import datetime
import HTMLParser
import urlparse
import urllib
import urllib2
import cookielib
import string
import re
import sys
import threading
import os
import tempfile
from bs4 import BeautifulSoup
from prettytable import PrettyTable
import Tkinter

# Python 2 only: sys.setdefaultencoding() is removed from the sys module at
# interpreter start-up, so the module has to be reloaded before it can be
# called.  Making UTF-8 the default lets implicit str/unicode conversions of
# Chinese text succeed instead of raising UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding("utf-8")

# Rows of result_utf8.csv, each split on commas.  getname() reads column 0 as
# the name and column 1 as the number.  Line endings are stripped so that the
# last field of a row compares equal to a bare value.
with open('result_utf8.csv') as _csv_file:
    csv = [line.rstrip('\r\n').split(',') for line in _csv_file]

# Matches one HTML tag: everything from '<' up to the next '>'.
_TAG_RE = re.compile(r'<[^>]*>')
# Matches a run of blanks, tabs and line breaks.
_WHITESPACE_RE = re.compile(r'[ \t\r\n]+')


def getname(no):
    """Return the name listed for number *no* in ``csv``.

    :param no: value compared against column 1 of every row.
    :returns: column 0 of the first matching row, or ``''`` if no row matches.
    """
    for row in csv:
        # Blank or truncated lines have no column 1; skip them instead of
        # raising IndexError.
        if len(row) > 1 and row[1] == no:
            return row[0]
    return ''


def getcourse(filename):
    """Extract the text of the selected-course row from a saved HTML page.

    The page is searched for the first occurrence of '退选' (the "drop
    course" link text); the first ``<tr>`` after it is taken as the row of
    interest.  That row looks like, e.g.::

        <tr>
            <td width="10%">
                <a id="GridView1_ctl02_LinkButton1"
                   href="javascript:__doPostBack('GridView1$ctl02$LinkButton1','')">348</a>
            </td><td>12</td><td>生物培优班</td><td>xxx</td><td>&nbsp;</td><td width="10%">
                <a id="GridView1_ctl02_LinkButton2"
                   href="javascript:__doPostBack('GridView1$ctl02$LinkButton2','')">退选</a>
            </td>

    Tags are dropped and the text pieces between them are joined with single
    spaces.  Collection stops after the first piece that contains ``&`` (that
    piece is still included).  Everything before the first ASCII digit is
    discarded and runs of whitespace are collapsed to one space.

    :param filename: path of the HTML file to read.
    :returns: the cleaned row text, or ``''`` when the marker, the row or a
        digit cannot be found.
    :raises IOError: if *filename* cannot be opened.
    """
    with open(filename) as page:
        s = page.read()

    marker = s.find('退选')
    if marker == -1:
        return ''
    trbegin = s.find('<tr>', marker)
    if trbegin == -1:
        return ''
    trend = s.find('</tr>', trbegin)
    if trend == -1:
        return ''

    pieces = []
    for text in _TAG_RE.split(s[trbegin:trend]):
        if not text:
            # Two tags with nothing between them contribute no text.
            continue
        pieces.append(text)
        if '&' in text:
            # An entity marks the end of the interesting cells.
            break
    res = ' '.join(pieces)

    first_digit = re.search(r'[0-9]', res)
    if first_digit is None:
        return ''
    # Collapse whitespace in one pass so that no character following a run of
    # spaces is lost.
    return _WHITESPACE_RE.sub(' ', res[first_digit.start():]).strip()


def report():
    """Write an HTML report for the work directory and open it in a browser.

    The directory is read from the ``workdir`` Tk variable.  For every regular
    file in it, the file name without its extension is used as the number,
    ``getname`` supplies the name and ``getcourse`` the course text.  The
    report goes to a temporary ``.html`` file that is kept on disk.

    The current working directory of the process is left unchanged.

    :raises OSError: if the work directory cannot be listed.
    """
    # Local import: the file-level import list does not include this module.
    import webbrowser

    wd = workdir.get()
    # List first so that a bad directory does not leave an empty temp file.
    entries = sorted(os.listdir(wd))

    with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
        f.write('<head><meta charset="utf-8"></head>')
        f.write('<h1>Spider report</h1>')
        f.write('<p><b>Version {}</b></p>'.format(wd))
        f.write('<table>')
        for entry in entries:
            path = os.path.join(wd, entry)
            if not os.path.isfile(path):
                continue
            # splitext copes with names that have no dot or several dots.
            no = os.path.splitext(entry)[0]
            name = getname(no)
            course = getcourse(path)
            f.write('<tr><th>{}</th><th>{}</th><td>{}</td></tr>'.format(no, name, course))
        f.write('</table>')

    # The file is closed (and therefore flushed) before the browser reads it.
    webbrowser.open(f.name)


# Window with a "Report" button and an entry for the work directory.
# NOTE(review): no mainloop() call is visible in this part of the file;
# confirm it is started further down.
gui = Tkinter.Tk()
workdir = Tkinter.StringVar()
Tkinter.Button(gui, text="Report", command=report).pack(side=Tkinter.LEFT)
Tkinter.Entry(gui, textvariable=workdir, width=40).pack(side=Tkinter.LEFT)