java调用Linux执行Python爬虫，并将数据存储到elasticsearch--（环境脚本搭建）

java调用Linux执行Python爬虫，并将数据存储到elasticsearch中

一、本博客代码使用的开发工具及环境如下：

1、idea：

2、jdk：1.8

3、elasticsearch：5.2.0

4、Linux

5、Python

6、maven

二、maven坐标：

<!--java连接Linux并执行脚本所需的jar包-->

        <dependency>

            <groupId>ch.ethz.ganymed</groupId>

            <artifactId>ganymed-ssh2</artifactId>

            <version>build209</version>

        </dependency>

        <dependency>

            <groupId>commons-io</groupId>

            <artifactId>commons-io</artifactId>

            <version>2.4</version>

            <type>jar</type>

            <scope>compile</scope>

        </dependency>

        <dependency>

            <groupId>commons-lang</groupId>

            <artifactId>commons-lang</artifactId>

            <version>2.6</version>

            <type>jar</type>

            <scope>compile</scope>

        </dependency>
<!--es相关坐标-->

<dependency>
    <groupId>org.elasticsearch.plugin</groupId>
    <artifactId>transport-netty4-client</artifactId>
    <version>5.2.0</version>
</dependency>
<dependency>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch</artifactId>
    <version>5.2.0</version>
</dependency>
<dependency>
    <groupId>org.nlpcn</groupId>
    <artifactId>elasticsearch-sql</artifactId>
    <version>6.3.0.0</version>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>druid</artifactId>
    <version>1.1.9</version>
</dependency>
<dependency>
    <groupId>org.elasticsearch.client</groupId>
    <artifactId>transport</artifactId>
    <version>5.2.0</version>
</dependency>

二、Linux脚本

# Switch to the directory that holds the crawler script. Abort if the
# directory is missing so the next command never runs from the wrong place.
cd /usr/local/python3/lib/python3.6/site-packages || exit 1

# Start the Flask crawler service in the background.
# NOTE(review): the script uses Python 3 only modules (urllib.request);
# confirm that `python` resolves to a Python 3 interpreter on the target host.
python linux_sina.py &

三、Python爬虫脚本

#!/usr/bin/python

# -*- coding: utf-8 -*-

"""

Created on Mon Aug 13 10:12:56 2018

@author: Administrator

"""

import public_python as p

import urllib.request

from bs4 import BeautifulSoup

#存储到ES

from elasticsearch import Elasticsearch

import flask

from flask import request

# Flask application object; the /news route below is registered on it and
# it is started by the server.run(...) call at the end of the script.
server = flask.Flask(__name__)

from selenium import webdriver

@server.route('/news',methods=['get','post'])
def get_html2():
    """Handle a /news request: collect article links and crawl them.

    Request parameters (query string or form):
        url: Either a single article URL (contains ".shtml") or a listing
            page. For a listing page every ``<a href>`` containing ".shtml"
            is collected.

    The collected, de-duplicated links are passed to ``getcontent``.

    Returns:
        "ok" when the links were handed to ``getcontent``; otherwise an
        ``(error text, HTTP status)`` tuple. The original code returned
        exception objects / ints here, which Flask cannot turn into a
        response.
    """
    url = request.values.get("url")
    if not url:
        return "missing required parameter: url", 400

    try:
        # Start Firefox without a visible window.
        firefox_options = webdriver.FirefoxOptions()
        firefox_options.add_argument('--headless')
        browser = webdriver.Firefox(firefox_options=firefox_options)
        try:
            # Both timeouts are set; the original author notes that only
            # setting the two together is effective.
            # NOTE(review): Selenium interprets these values as seconds, so
            # 30000 is roughly 8 hours - confirm whether 30 was intended.
            browser.set_page_load_timeout(30000)
            browser.set_script_timeout(30000)
            browser.get(url)

            # A set gives de-duplication for free.
            links = set()
            if ".shtml" in url:
                # The URL is already a single article page.
                links.add(url)
            else:
                # Listing page: gather every anchor under <body> whose href
                # points at an article. Anchors without an href yield None
                # and are skipped.
                body = browser.find_element_by_tag_name('body')
                for anchor in body.find_elements_by_tag_name("a"):
                    href = anchor.get_attribute("href")
                    if href and ".shtml" in href:
                        links.add(href)
        finally:
            # Always release the browser; getcontent starts its own instance.
            browser.quit()

        if not links:
            message = "no .shtml links found: " + url
            print(message)
            return message, 500

        getcontent(links)
        return "ok"

    except urllib.error.URLError as e:
        # Prefer the HTTP status code when present, else the failure reason.
        detail = getattr(e, "code", None)
        if detail is None:
            detail = getattr(e, "reason", None)
        print(detail)
        return str(detail), 500

    except Exception as e:
        print("exception:"+str(e))
        return str(e), 500

def getcontent(data_urls):
    """Crawl every article URL and store title + HTML content in Elasticsearch.

    For each URL the page is loaded in headless Firefox to read the title and
    publish time, then fetched again with urllib and parsed with
    BeautifulSoup to extract the ``<img>`` / ``<p>`` tags of the article
    body. Images are downloaded to /opt/zc/img and their ``src`` rewritten.
    The article is inserted into the index named by the ``theme_id`` request
    parameter (document type "sina") unless its title is already present.

    Args:
        data_urls: Iterable of article URLs (expected to contain ".shtml").

    Returns:
        "ok" when the loop completed; on failure the HTTP code, the failure
        reason or the exception object (same values the original returned).

    Fixes compared with the original:
        * every URL is processed (the original returned after the first one);
        * the body selection is reset per page instead of reusing a stale
          global from the previous page;
        * tags from all matching containers are kept, not only the last;
        * the browser and the HTTP response are always closed;
        * the error handlers no longer raise TypeError while printing.
    """
    # Kept for backward compatibility: the original published these values
    # as module-level globals.
    # NOTE(review): they are shared between concurrent requests.
    global title, time, strcon, cos

    browser = None
    try:
        from selenium import webdriver

        # News category; used as the Elasticsearch index name.
        theme_id = request.values.get("theme_id")

        # Start Firefox without a visible window.
        firefox_options = webdriver.FirefoxOptions()
        firefox_options.add_argument('--headless')
        browser = webdriver.Firefox(firefox_options=firefox_options)

        # Connect to the Elasticsearch cluster.
        es = Elasticsearch(hosts=['192.168.200.211:9201'], maxsize=25)

        processed_any = False
        for url in data_urls:
            print(url)
            browser.get(url)

            # Sina pages expose the title / time under two different layouts.
            title = _element_text(browser, "artibodyTitle", "main-title",
                                  "无法爬取标题")
            print(title)
            time = _element_text(browser, "pub_date", "date", "无法爬取时间")

            # Fetch the raw HTML for body extraction.
            # NOTE(review): urlopen's timeout is in seconds - confirm 30000.
            with urllib.request.urlopen(url=url, timeout=30000) as response:
                data = response.read().decode('utf-8', 'ignore')
            soup = BeautifulSoup(data, 'lxml')

            # The body lives either under class "article" or id "artibody".
            cos = soup.select('.article ') or soup.select('#artibody ')
            if not cos:
                print("no article body found: " + url)
                continue

            # Collect <img>/<p> tags from every container, skipping tags
            # already seen through an enclosing container.
            conlist = []
            seen = set()
            for container in cos:
                for tag in container.find_all(["img", "p"]):
                    if id(tag) in seen:
                        continue
                    seen.add(id(tag))
                    if tag.name == "img":
                        _localize_image(tag, url)
                    conlist.append(tag)

            # Serialise after all src rewrites so they show up in the output.
            strcon = "".join(str(tag) for tag in conlist)

            _store_article(es, theme_id, title, strcon)
            processed_any = True

        if processed_any:
            # Debug output: dump the index content once at the end.
            print(p.search(theme_id, "sina"))
        return "ok"

    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print("===" + str(e.code))
            return e.code
        if hasattr(e, "reason"):
            print("----" + str(e.reason))
            return e.reason

    except Exception as e:
        print("exception:" + str(e))
        return e

    finally:
        if browser is not None:
            browser.quit()


def _element_text(browser, element_id, class_name, fallback):
    """Return the text of the element with *element_id*, else of the first
    element with *class_name*, else *fallback*.

    Uses the plural ``find_elements_*`` calls, which return an empty list
    instead of raising when nothing matches.
    """
    matches = browser.find_elements_by_id(element_id)
    if not matches:
        matches = browser.find_elements_by_class_name(class_name)
    if matches:
        return matches[0].text
    return fallback


def _localize_image(img_tag, page_url):
    """Download the image behind *img_tag* and point its ``src`` at the copy.

    A failed download is reported and the tag is left untouched so that one
    broken image does not abort the whole crawl.
    """
    from urllib.parse import urljoin

    src = img_tag.attrs.get("src")
    if not src:
        return
    src = str(src)

    # Keep the full last path segment as the file name.
    file_name = src.split("/")[-1]
    if not file_name:
        return

    # Protocol-relative sources get "http:" as before; anything else is
    # resolved against the page URL.
    if src.startswith("//"):
        download_url = "http:" + src
    else:
        download_url = urljoin(page_url, src)

    try:
        urllib.request.urlretrieve(download_url,
                                   "//opt//zc//img//" + file_name)
    except (OSError, ValueError) as e:
        print("image download failed: " + download_url + " " + str(e))
        return

    img_tag.attrs["src"] = "http:/" + "/opt/zc/img/" + file_name


def _store_article(es, theme_id, title, content):
    """Insert the article into index *theme_id* unless the title exists.

    Returns True when a document was inserted, False for a duplicate title.
    """
    if not es.indices.exists(index=theme_id):
        # First document of a new index starts at id 1.
        es.indices.create(index=theme_id)
        p.insert_result(theme_id, "sina",
                        {"id": 1, "title": title, "content": content})
        return True

    # Per the original comments, p.clean_data converts the search result to
    # a DataFrame; `[[col]].values` is 2-D, so flatten it.
    # NOTE(review): behaviour of p.search / p.clean_data on an empty index
    # is not visible here - confirm.
    existing = p.clean_data(p.search(theme_id, "sina"))
    titles = [value for row in existing[["title"]].values for value in row]
    if title in titles:
        print("已经存在此标题")
        return False

    ids = [value for row in existing[["id"]].values for value in row]
    # max(ids) is a numpy integer; convert to int before incrementing.
    autoid = int(max(ids)) + 1 if ids else 1
    p.insert_result(theme_id, "sina",
                    {"id": autoid, "title": title, "content": content})
    return True

if __name__ == "__main__":
    # Start the service only when executed as a script, not on import.
    # debug is off: with debug=True the Werkzeug interactive debugger is
    # reachable by anyone who can connect, and this service listens on all
    # interfaces (0.0.0.0).
    server.run(host='0.0.0.0', port=8000, debug=False)

以上Linux脚本及Python爬虫脚本由我的同事张超提供，作者在此表示感谢。下一篇博客作者将为大家提供后台java代码（https://www.cnblogs.com/chenyuanbo/p/9973769.html）。本篇博客主要供自己编写Python爬虫的小伙伴参考。

秒客网

java调用Linux执行Python爬虫，并将数据存储到elasticsearch--（环境脚本搭建）

相关文章