本文实例讲述了Python基于pandas实现json格式转换成dataframe的方法。分享给大家供大家参考，具体如下：
# -*- coding:utf-8 -*-
#!python3
import re
import json
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os
from pandas.io.json import json_normalize
class image_structs():
    """Container for a single product gallery image.

    Wraps one record dict (DOM id + src URL of an ``<img>`` tag), shaped
    to match the rows later flattened by ``json_normalize``.
    """

    def __init__(self):
        # Fresh record per instance; filled in by the scraper.
        self.picture_url = {
            "image_id": '',
            "picture_url": '',
        }
class data_structs():
    """Container for one scraped catalog item.

    Holds a record dict whose keys mirror the output DataFrame columns:
    title, item_url, id, picture_url (list of image records), std_desc,
    description, information and fitment.
    """

    def __init__(self):
        # "picture_url" collects image_structs.picture_url dicts and is the
        # record path that json_normalize explodes into one row per image.
        self.info = {
            "title": '',
            "item_url": '',
            "id": 0,
            "picture_url": [],
            "std_desc": '',
            "description": '',
            "information": '',
            "fitment": '',
        }
# "https://waldoch.com/store/catalogsearch/result/index/?cat=0&limit=200&p=1&q=nerf+bar"
# https://waldoch.com/store/new-oem-ford-f-150-f150-5-running-boards-nerf-bar-crew-cab-2015-w-brackets-fl34-16451-ge5fm6.html
def get_item_list(outfile, pages=6):
    """Scrape the paginated search results and save title/URL pairs to Excel.

    Walks the "nerf bar" catalog search pages, collects every product link,
    de-duplicates, assigns a numeric id per row, and writes the table.

    Args:
        outfile: Path of the Excel file to write.
        pages: Number of result pages to fetch (default 6, matching the
            site's pagination at the time the script was written).
    """
    base_url = ("https://waldoch.com/store/catalogsearch/result/index/"
                "?cat=0&limit=200&p={page}&q=nerf+bar")
    result = []
    for page in range(1, pages + 1):  # site pages are 1-based
        print(page)
        web = requests.get(base_url.format(page=page))
        soup = BeautifulSoup(web.text, "html.parser")
        for a in soup.find_all("a", class_="product-image"):
            result.append([a["title"], a["href"]])
    df = pd.DataFrame(result, columns=["title", "item_url"])
    df = df.drop_duplicates()
    # Use the (post-dedup) index as a stable per-item id.
    df["id"] = df.index
    df.to_excel(outfile, index=False)
def get_item_info(file, outfile):
    """Fetch each item's detail page and flatten the record to Excel rows.

    Reads the title/item_url/id table written by get_item_list, scrapes
    each product page for its gallery images and the Description /
    Information / Fitment sections, then flattens the nested record with
    pd.json_normalize into one row per image.

    Args:
        file: Excel file produced by get_item_list.
        outfile: Path of the Excel file written at the end of the loop.
    """
    DEFAULT_FALSE = ""  # placeholder when a page section is missing
    df = pd.read_excel(file)
    for i in df.index:
        item_id = df.loc[i, "id"]  # renamed: don't shadow builtin id()
        # Skip items that already have their own per-id workbook.
        if os.path.exists(str(int(item_id)) + ".xlsx"):
            continue
        item_url = df.loc[i, "item_url"]
        web = requests.get(item_url)
        soup = BeautifulSoup(web.text, "html.parser")
        # Gallery images: element ids start with "gallery-image".
        imglink = soup.find_all("img", class_=re.compile("^gallery-image"))
        data = data_structs()
        data.info["title"] = df.loc[i, "title"]
        data.info["id"] = item_id
        data.info["item_url"] = item_url
        for a in imglink:
            image = image_structs()
            image.picture_url["image_id"] = a["id"]
            image.picture_url["picture_url"] = a["src"]
            print(image.picture_url)
            data.info["picture_url"].append(image.picture_url)
        print(data.info)
        # std_desc: plain text of the itemprop="description" block.
        # find() returns None when the section is absent, so accessing
        # .stripped_strings raises AttributeError — catch only that
        # (the original bare `except:` swallowed everything).
        std_desc = soup.find("div", itemprop="description")
        try:
            strings_desc = "\n".join(std_desc.stripped_strings)
        except AttributeError:
            strings_desc = DEFAULT_FALSE
        # description / information / fitment: the element that follows
        # the matching <h2> heading, or "" when the heading is missing.
        try:
            description = soup.find('h2', text="Description").find_next()
        except AttributeError:
            description = DEFAULT_FALSE
        try:
            information = soup.find("h2", text='Information').find_next()
        except AttributeError:
            information = DEFAULT_FALSE
        try:
            fitment = soup.find('h2', text='Fitment').find_next()
        except AttributeError:
            fitment = DEFAULT_FALSE
        data.info["std_desc"] = strings_desc
        data.info["description"] = str(description)
        data.info["information"] = str(information)
        data.info["fitment"] = str(fitment)
        print(data.info.keys())
        # Explode "picture_url" into one row per image, repeating the
        # item-level columns. pd.json_normalize replaces the deprecated
        # pandas.io.json.json_normalize import.
        singledf = pd.json_normalize(
            data.info,
            "picture_url",
            ['title', 'item_url', 'id', 'std_desc', 'description',
             'information', 'fitment'],
        )
        singledf.to_excel("test.xlsx", index=False)
        # NOTE(review): exit() after the first item looks like debugging
        # residue — presumably each item should be written to its own
        # str(int(item_id)) + ".xlsx" file (cf. the skip check above).
        # Kept to preserve the original behavior; confirm before removing.
        exit()
    # print(df.ix[i])
    df.to_excel(outfile, index=False)
# get_item_list("item_urls.xlsx")
if __name__ == "__main__":
    # Guard so importing this module does not kick off a full scrape.
    get_item_info("item_urls.xlsx", "item_urls_info.xlsx")
这里涉及到的几个Python模块都可以使用pip install命令进行安装，如：

pip install BeautifulSoup4
pip install xlrd
pip install openpyxl
希望本文所述对大家Python程序设计有所帮助。
原文链接:https://blog.csdn.net/zn505119020/article/details/78964111