本文实例讲述了python自定义解析简单xml格式文件的方法。分享给大家供大家参考。具体分析如下:
因为公司内部的接口返回的字串支持2种形式:php数组,xml;结果php数组python不能直接用,而xml字符串的格式不是标准的,所以也不能用标准模块解析。【不标准的地方是某些节点的名称是以数字开头的】,所以写个简单的脚本来解析一下文件,用来做接口测试。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
|
#!/usr/bin/env python
#encoding: utf-8
import re
class xmlparse(object):
    """Minimal regex-based extractor for simple, attribute-free XML strings.

    Meant for responses that standard XML modules reject (for example
    element names that start with a digit).  Elements are located with
    regular expressions, so an element nested inside another element of the
    same name is not matched correctly.

    Attributes:
        xmlstr: The original string passed to the constructor.
        xmldom: ``xmlstr`` without its leading ``<?xml ... ?>`` declaration;
            byte strings declared as GBK/GB2312 are transcoded to UTF-8.
        xmlnodelist: Every match found for the last path step of the most
            recent ``parse`` call.
        xpath: The path passed to the most recent ``parse`` call.
    """

    _CDATA_OPEN = '<![CDATA['
    _CDATA_CLOSE = ']]>'

    def __init__(self, xmlstr):
        """Store *xmlstr* and prepare the declaration-free document body."""
        self.xmlstr = xmlstr
        self.xmldom = self.__convet2utf8()
        self.xmlnodelist = []
        self.xpath = ''

    def __convet2utf8(self):
        """Return the document without its declaration, as UTF-8 if needed."""
        headstr = self.__get_head()
        # The declaration is matched at position 0 only, so slicing removes
        # exactly that occurrence (and is a no-op when there is none).
        xmldomstr = self.xmlstr[len(headstr):]
        if not isinstance(xmldomstr, bytes):
            # Already-decoded text has no byte encoding left to convert.
            return xmldomstr
        declared = headstr.lower()  # encoding names are case-insensitive
        for codec in ('gbk', 'gb2312'):
            if codec in declared:
                # Transcode the header-stripped body, never the raw input.
                return xmldomstr.decode(codec).encode('utf-8')
        return xmldomstr

    def __get_head(self):
        """Return the leading ``<?xml ... ?>`` declaration, or ``''``."""
        # Non-greedy so the match stops at the first ``?>``.
        found = re.match(r'<\?xml.*?\?>', self.xmlstr)
        return found.group() if found else ''

    @classmethod
    def _strip_cdata(cls, text):
        """Unwrap *text* when it begins with a CDATA section marker."""
        if not text.startswith(cls._CDATA_OPEN):
            return text
        text = text[len(cls._CDATA_OPEN):]
        # Only drop the terminator when it is really there, so malformed
        # content does not silently lose its last three characters.
        if text.endswith(cls._CDATA_CLOSE):
            text = text[:-len(cls._CDATA_CLOSE)]
        return text

    def parse(self, xpath):
        """Return the content of the element addressed by *xpath*.

        Args:
            xpath: Slash-separated element names, e.g. ``'/a/product_id'``.
                A step may end in ``[n]`` to pick the n-th match; ``n`` is a
                zero-based Python index (the default is 0), unlike real
                XPath.  Each step is searched anywhere inside the text
                selected by the previous step, not only among direct
                children.

        Returns:
            The text between the selected element's tags with a surrounding
            CDATA wrapper removed.  An empty path (``''`` or ``'/'``)
            returns the whole document body.

        Raises:
            IndexError: If a step matches no element at the requested index.
            ValueError: If the text inside ``[...]`` is not an integer.
        """
        self.xpath = xpath
        steps = []
        for xnode in xpath.split('/'):
            if not xnode:
                continue  # leading, trailing or doubled slashes
            spcindex = xnode.find('[')
            if spcindex > -1:
                index = int(xnode[spcindex + 1:-1])
                xnode = xnode[:spcindex]
            else:
                index = 0
            # Escape the name so characters such as '.' match literally, and
            # use DOTALL so element content may span several lines.
            name = re.escape(xnode)
            pattern = re.compile('<%s>(.*?)</%s>' % (name, name), re.DOTALL)
            steps.append((pattern, index))

        xmlnodestr = self.xmldom
        xmlnodelist = []  # stays empty when the path has no steps
        for pattern, index in steps:
            xmlnodelist = pattern.findall(xmlnodestr)
            try:
                xmlnodestr = xmlnodelist[index]
            except IndexError:
                raise IndexError(
                    'xpath %r: no match at index %d for pattern %r'
                    % (xpath, index, pattern.pattern))
            xmlnodestr = self._strip_cdata(xmlnodestr)
        self.xmlnodelist = xmlnodelist
        return xmlnodestr
if '__main__' == __name__:
    # Demo: run three sample paths against a fixed response string and print
    # what each lookup returns.
    # NOTE: the sample is not well-formed (<product_name> is only closed
    # after </b>); it is kept unchanged as the demonstration input.
    xmlstr = '<?xml version="1.0" encoding="utf-8" standalone="yes" ?><resultObject><a><product_id>aaaaa</product_id><product_name><![CDATA[bbbbb]]></a><b><product_id>bbbbb</product_id><product_name><![CDATA[bbbbb]]></b></product_name></resultObject>'
    xp = xmlparse(xmlstr)
    # Single pre-formatted argument per call: prints the same text whether
    # ``print`` is the Python 2 statement or the Python 3 function.
    print('xmlstr: %s' % xp.xmlstr)
    print('xmldom: %s' % xp.xmldom)
    for xpath in ('/product_id', '/product_id[1]', '/a/product_id'):
        print('------------------------------')
        getstr = xp.parse(xpath)
        print('xpath: %s' % xp.xpath)
        print('get list: %s' % xp.xmlnodelist)
        print('get string: %s' % getstr)
|
运行结果:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
xmlstr: <?xml version="1.0" encoding="utf-8" standalone="yes" ?><resultObject><a><product_id>aaaaa</product_id><product_name><![CDATA[bbbbb]]></a><b><product_id>bbbbb</product_id><product_name><![CDATA[bbbbb]]></b></product_name></resultObject>
xmldom: <resultObject><a><product_id>aaaaa</product_id><product_name><![CDATA[bbbbb]]></a><b><product_id>bbbbb</product_id><product_name><![CDATA[bbbbb]]></b></product_name></resultObject>
------------------------------
xpath: /product_id
get list: ['aaaaa', 'bbbbb']
get string: aaaaa
------------------------------
xpath: /product_id[1]
get list: ['aaaaa', 'bbbbb']
get string: bbbbb
------------------------------
xpath: /a/product_id
get list: ['aaaaa']
get string: aaaaa
|
因为返回的xml格式比较简单,没有带属性的节点,所以处理起来就比较简单了。但测试还是发现有一个bug:当相同名称的节点嵌套时,正则匹配会出问题。该问题可以通过避免在xpath中出现有嵌套节点的名称来解决,否则只有重写更复杂的解析机制了。
希望本文所述对大家的Python程序设计有所帮助。