开源搜索引擎abelkhan

发起一个开源项目http://www.abelkhan.com/

目前而言，已经用python编写了一个网络爬虫抓取页面，和一个简单的前端

网络爬虫，已经有很多高手写过，我基本上奉行了拿来主义，

得益于python完善的lib，这个网络爬虫实现起来非常的简单:

使用urllib2从对应的url地址抓取html

def get_page(url):
    """Fetch *url* and return its body together with the response headers.

    The request is sent with browser-like headers because some sites refuse
    requests that look like they come from a crawler.  A fresh cookie jar is
    used for every call and the request times out after 5 seconds.

    Returns:
        A ``(body, headers)`` tuple, where ``body`` is what
        ``response.read()`` returned and ``headers`` is what
        ``response.info()`` returned.  If anything goes wrong the traceback
        is printed and ``None`` is returned instead, so callers must check
        the result before unpacking it.
    """
    request_headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240',
        'Connection':'Keep-Alive',
        'Accept':'text/html, application/xhtml+xml, image/jxr, */*',
        'Accept-Language':'zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3',
    }

    try:
        cookie_jar = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
        req = urllib2.Request(url = url, headers = request_headers)
        response = opener.open(req, timeout = 5)
        try:
            the_page = response.read()
            response_headers = response.info()
        finally:
            # Release the connection even when reading the body fails.
            response.close()
        return the_page, response_headers
    except Exception:
        # Best effort: one unreachable page must not stop the crawl.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        import traceback
        traceback.print_exc()
        return None

一个需要注意的地方是，有部分网站会限制爬虫访问，所以我加入了headers用于模拟浏览器访问。

这个方法差强人意，但是我也没有找到一个更完善的办法。

抓取到页面后，基于HTMLParser做了html的解析:

class htmlprocess(HTMLParser.HTMLParser):
    """Parser for one crawled page.

    While the page is fed through the parser it fills in the ``urlinfo``
    dict given to the constructor (entries ``'keys'``, ``'title'``,
    ``'titlegen'`` and ``'profile'``) and collects the accepted outgoing
    links in ``self.urllist``, which maps each link URL to a new record with
    the entries ``'url'``, ``'keys'``, ``'title'``, ``'titlegen'`` and
    ``'profile'``.

    NOTE(review): every bucket under ``'keys'`` and ``'profile'`` is indexed
    with the empty string ``''``, and the record literal in ``_handle_link``
    repeats that key three times (so the dict ends up with a single entry).
    This looks like distinct bucket names were lost before this copy of the
    code was made -- confirm against the upstream repository.  The literals
    are reproduced unchanged.
    """

    # Number of characters kept for a 'titlegen' candidate.
    _TITLE_CHARS = 16
    # Number of characters kept for a 'profile' snippet.
    _PROFILE_CHARS = 32

    # Link substrings that are never followed while crawling a cnblogs page.
    # Some entries are subsumed by shorter ones; the list is kept as found.
    _CNBLOGS_BLOCKED = (
        "http://msg.cnblogs.com/send?recipient=itwriter",
        "http://i.cnblogs.com/EditPosts.aspx?opt=1",
        "http://i.cnblogs.com/EditPosts.aspx?postid=1935371",
        "http://msg.cnblogs.com/send?recipient=itwriter/",
        "http://msg.cnblogs.com/send?recipient=itwriter/GetUsername.aspx",
        "/EnterMyBlog.aspx?NewArticle=1",
        "GetUsername",
        "GetMyPassword",
        "http://i.cnblogs.com/EditPosts.aspx?postid=",
    )

    def __init__(self, urlinfo):
        """Start parsing state for the page described by *urlinfo*.

        *urlinfo* must provide ``'url'`` and ``'keys'``; the words obtained
        by splitting the URL (except "com", "www" and "cn") are appended to
        its key list right away.  The dict is modified in place.
        """
        HTMLParser.HTMLParser.__init__(self)

        self.urllist = {}
        self.sub_url = ""
        self.urlinfo = urlinfo
        self.current_url = urlinfo['url']

        for key in doclex.simplesplit(self.current_url):
            if key not in ("com", "www", "cn"):
                self.urlinfo['keys'][''].append(key)

        self.current_tag = ""
        self.style = ""

    @staticmethod
    def _decode(raw):
        """Decode *raw* with the encoding chardet guesses.

        Returns ``None`` when chardet reports no encoding.
        """
        encoding = chardet.detect(raw)['encoding']
        if not encoding:
            return None
        return unicode(raw, encoding)

    def _title_snippet(self, udata):
        """Return the first ``_TITLE_CHARS`` characters of *udata* as UTF-8."""
        return udata[:self._TITLE_CHARS].encode('utf-8')

    def handle_starttag(self, tag, attrs):
        """Record the current tag and process ``meta``, ``a`` and ``link`` tags."""
        self.current_tag = tag
        self.style = 'None'
        self.sub_url = ""

        if tag == 'meta':
            self._handle_meta(attrs)
        # HTMLParser hands over tag names in lower case, so an extra
        # comparison against 'A' is not needed.
        elif tag in ('a', 'link'):
            self._handle_link(attrs)

    def _handle_meta(self, attrs):
        """Pick up keywords / description from a ``meta`` tag."""
        # First pass: find out what kind of meta tag this is.
        for name, value in attrs:
            if name != 'name':
                continue
            if value == 'keywords' or value == 'metaKeywords':
                self.style = 'keywords'
            elif value == 'description' or value == 'metaDescription':
                self.style = 'profile'

        # Second pass: consume its content according to that kind.
        for name, value in attrs:
            if name != 'content':
                continue

            if self.style == 'keywords':
                keywords = doclex.simplesplit(value)
                if isinstance(keywords, list):
                    self.urlinfo['keys'][''].extend(keywords)
            elif self.style == 'profile':
                self.urlinfo['profile'][''] = value

            # Whatever the kind, the content is also a title candidate.
            udata = self._decode(value)
            if udata is not None:
                self.urlinfo['titlegen'].append(self._title_snippet(udata))
            else:
                self.urlinfo['titlegen'].append(value)

    def _handle_link(self, attrs):
        """Filter the ``href`` of an ``a`` / ``link`` tag and register it.

        An accepted link is stored in ``self.urllist`` and remembered in
        ``self.sub_url`` so that the following text can be attached to it.
        """
        for name, value in attrs:
            if name != 'href':
                continue

            # HTMLParser passes None for an attribute without a value;
            # treat that like an empty href instead of failing on len(None).
            if not value:
                return

            if not judged_url(value):
                # NOTE(review): plain concatenation onto the page URL.  When
                # the page URL ends with '/' and the href starts with '/',
                # the result contains '//'; urlparse.urljoin may be the
                # better tool -- confirm what judged_url() accepts first.
                if not self.current_url.endswith('/') and not value.startswith('/'):
                    value = self.current_url + '/' + value
                else:
                    value = self.current_url + value

            if 'javascript' in value or 'javaScript' in value:
                return

            if "apple" in self.current_url:
                if "http://www.apple.com/cn/mac#ac-gn-menustate" in value:
                    return

            if "cnblogs" in self.current_url:
                for blocked in self._CNBLOGS_BLOCKED:
                    if blocked in value:
                        return
                # On cnblogs the trailing '#' is removed *before* the
                # containment check below.
                if value.endswith('#'):
                    value = value[0:-1]

            # Skip links that are (a part of) the URL of the page itself.
            if value in self.current_url:
                return

            if value.endswith('#'):
                value = value[0:-1]

            if value != self.current_url and len(value) < 64 and not ingoreurl(value):
                self.urllist[value] = {'url':value, 'keys':{'':[], '':[], '':[]}, 'title':'', 'titlegen':[], 'profile':{'':'', '':'', '':[]}}
                self.sub_url = value
                print(value)

    def handle_data(self, data):
        """Attribute a run of text to the page title, the current link or the page body.

        Errors are printed and otherwise ignored so that one bad text node
        does not abort the parse.
        """
        if self.current_tag == 'title':
            handler = self._on_title_text
        elif self.current_tag == 'a':
            handler = self._on_link_text
        else:
            handler = self._on_body_text

        try:
            handler(data)
        except Exception:
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            import traceback
            traceback.print_exc()

    def _on_title_text(self, data):
        """Store the page title and the keys lexed from it."""
        data = doclex.delspace(data)

        keys = doclex.lex(data)
        if isinstance(keys, list) and len(keys) > 0:
            self.urlinfo['keys'][''].extend(keys)

        if len(data) > 0:
            self.urlinfo['title'] = data

    def _on_link_text(self, data):
        """Attach link text to the record of the most recently accepted link."""
        if self.sub_url == "":
            return
        record = self.urllist[self.sub_url]

        keys = doclex.simplesplit(data)
        if isinstance(keys, list) and len(keys) > 0:
            for key in keys:
                if key in record['keys']['']:
                    record['keys'][''].remove(key)
                if key not in record['keys'][''] and key not in record['keys']['']:
                    record['keys'][''].append(key)

        udata = self._decode(data)
        if udata is None:
            return
        record['titlegen'].append(self._title_snippet(udata))
        if len(udata) > self._TITLE_CHARS:
            record['profile'][''] = udata[0:self._PROFILE_CHARS].encode('utf-8')

    def _on_body_text(self, data):
        """Use any other text as title candidate, profile snippet and key source."""
        if doclex.invialddata(data):
            return
        data = doclex.delspace(data)

        udata = self._decode(data)
        if udata is None:
            # No encoding detected: nothing is recorded for this text, as
            # before, but without raising from unicode(data, None).
            return

        self.urlinfo['titlegen'].append(self._title_snippet(udata))

        if len(udata) > self._PROFILE_CHARS:
            # NOTE(review): this appends to the same slot that _handle_meta
            # assigns a plain string to; see the class docstring.
            self.urlinfo['profile'][''].append((udata[0:self._PROFILE_CHARS] + u"...").encode('utf-8'))

        for key in doclex.lex(data):
            self.urlinfo['keys'][''].append(key)

基本上，要说的就是：HTMLParser的使用方法见文档。HTMLParser预先定义了一组虚接口handle_starttag、handle_data和handle_endtag，使用者通过重写这三个接口，来实现对html中的tag进行处理，进而完整地解析抓取到的html。

不过从搜索结果来看，搜索的质量还很不尽如人意，欢迎大家参与并提出意见。

项目地址:http://www.abelkhan.com/

向我们提出意见:http://www.abelkhan.com/guestbook/

对项目进行捐助:http://www.abelkhan.com/collection/

代码托管地址如下: https://github.com/qianqians/websearch ，欢迎大家参与

秒客网

开源搜索引擎abelkhan

相关文章