I have a script that I am writing to parse a csv file.
我正在编写一个用于解析 csv 文件的脚本。
If the file starts with a `<!--` comment line, I want to be able to skip it, move to the next text line and continue parsing.
如果文件以 `<!--` 注释行开头,我希望能够跳过它,移动到下一个文本行并继续解析。
The issue is with my regex which I cannot get to match.
问题在于我的正则表达式始终无法匹配。
if re.match(r'^.\<\!', line):
line.next()
Example text
<!-- Copyright Notice: © 2010 Racing NSW (and other parties working with it). NSW racing information,including fields, form and results, is subject to copyright which is owned by Racing NSW and other parties working with it. -->
Meeting,17/02/16,CANT,Canterbury Park,Weights,TAB,+6m Entire Circuit, ,
Race,1,BENCHMARK 77 HANDICAP,BM77,BM77,1550,BM77 ,3U ,~ ,HCP ,54,0,0,17/02/2016,, , , , ,BenchMark 77, Handicap, For Three-Years-Old and Upwards, No sex restriction,Of $40000. First $23025, second $7925, third $3960, fourth $1885, fifth $955, sixth $450, seventh $450, eighth $450, ninth $450, tenth $450
Horse,1,Balboa Park (NZ),0,"Gai Waterhouse",Randwick,,0,54.5,3-1-1-0 $30000.00,,0,0,0,,65.00,G,
Horse,2,Baylie Louise,0,"Matthew Dale",Canberra,,0,55,16-6-2-4 $112545.00,,0,0,0,,69.00,M,
Horse,3,Beretta,0,"Kris Lees",Broadmeadow,,0,55.5,8-2-1-1 $38305.00,,0,0,0,,66.00,G,
Horse,4,Elle Lou,0,"Chris Waller",Rosehill,,0,57.5,14-2-4-0 $141625.00,,0,0,0,,74.00,M,
Horse,5,Got Unders,0,"Ken Lantry",Broadmeadow,,0,60,33-4-9-9 $140735.00,,0,0,0,,75.00,G,
Horse,6,Lord de Air,0,"Bede Murray",Kembla Grange,,0,57,16-4-2-3 $89050.00,,0,0,0,,69.00,G,
Horse,7,Lucky Liaison,0,"Kristen Buchanan",Wyong,,0,61,49-8-6-8 $257865.00,,0,0,0,,77.00,G,
Horse,8,Makeadane,0,"John P Thompson",Randwick,,0,55,15-2-2-2 $65002.00,,0,0,0,,65.00,G,
Horse,9,Miss Denni (NZ),0,"Chris Waller",Rosehill,,0,57.5,12-2-5-1 $102075.00,,0,0,0,,74.00,M,
Horse,10,Multifacets (NZ),0,"Chris Waller",Rosehill,,0,54,6-1-0-0 $19845.00,,0,0,0,,62.00,C,
Horse,11,Mydream,0,"Melissa Harrison",Kembla Grange,,0,56.5,34-8-2-3 $142520.00,,0,0,0,,72.00,M,
Horse,12,Never Back Down,0,"Jim & Greg Lee",Randwick,,0,58,33-4-3-8 $151090.00,,0,0,0,,71.00,G,
Horse,13,Orcym Sam,0,"Gwenda Markwell",Kembla Grange,,0,59,6-3-2-0 $44350.00,,0,0,0,,73.00,G,
Horse,14,Recife Beach,0,"Kim Waugh",Wyong,,0,57,21-3-5-2 $77175.00,,0,0,0,,69.00,G,
Horse,15,Soros,0,"Joseph Pride",Warwick Farm,,0,60,36-6-2-4 $249975.00,,0,0,0,,75.00,G,
Horse,16,Spiritos,0,"Chris Waller",Rosehill,,0,55.5,8-2-0-1 $45585.00,,0,0,0,,67.00,G,
Horse,17,Ultima Chance,0,"Scott Collings",Goulburn,,0,55,39-9-6-3 $104437.00,,0,0,0,,65.00,G,
Race,2,BENCHMARK 72 HANDICAP,BM72,BM72,1250,BM72 ,3U ,~ ,HCP ,55.5,0,0,17/02/2016,,
This is full file
这是完整的文件
import csv
import os
import re
from sys import argv
# Unpack the command line into the script's own name and the input CSV path.
# The two-target unpack raises ValueError unless argv has exactly two items.
SCRIPT, FILENAME = argv
def out_file_name(file_name):
    """Return *file_name* with ``_clean`` inserted before its extension.

    Only the final extension is treated as the suffix, so dots elsewhere in
    the path (``./data/races.csv``, ``my.data.csv``) are left alone.  A name
    without an extension simply gets ``_clean`` appended.

    Args:
        file_name: Path or bare name of the input file.

    Returns:
        The derived output path, e.g. ``races.csv`` -> ``races_clean.csv``.
    """
    # splitext splits on the last dot of the final path component only,
    # unlike str.split("."), which breaks on every dot in the whole path.
    stem, extension = os.path.splitext(file_name)
    return stem + '_clean' + extension
def race_table(text_file):
    """Flatten a Meeting/Race/Horse CSV export into one row per horse.

    The first field of each row selects how it is handled:

    * ``Meeting`` rows set the current meeting name, rail, weather and track.
    * ``Race`` rows set the current date, race number, benchmark and distance.
    * ``Horse`` rows are combined with the current meeting and race values
      and appended to the result.

    Blank rows and rows whose first field starts with ``<!`` (such as a
    leading ``<!-- ... -->`` comment line) are skipped.  Rows with any other
    first field are ignored.

    Args:
        text_file: Iterable of rows, each a sequence of string fields, for
            example a ``csv.reader``.  It is consumed in a single pass.

    Returns:
        A list of tuples ordered as (meeting, date, rail, weather, track,
        distance, benchmark, race, number, name, sex, b_rating, weight,
        barrier, starts, wins, seconds, thirds, prizemoney, trainer,
        location, jockey).

    Raises:
        IndexError: If a recognised row has fewer fields than are read from
            it, or a Horse row's results field has fewer than four parts.
    """
    output_table = []
    # Meeting/race context carried forward onto every following Horse row.
    # Starting from empty strings means a Horse row that appears before any
    # Meeting or Race row produces blanks rather than an UnboundLocalError.
    meeting = rail = weather = track = ''
    date = race = benchmark = distance = ''
    for record in text_file:
        if not record:
            # csv.reader yields [] for blank lines; nothing to classify.
            continue
        row_type = record[0]
        if row_type.lstrip().startswith('<!'):
            # Comment line: it may be split into several fields by the commas
            # in its text, but the first field still carries the marker.
            continue
        if row_type == 'Meeting':
            meeting = record[3]
            rail = record[6]
            weather = record[7]
            track = record[8]
        elif row_type == 'Race':
            date = record[13]
            race = record[1]
            benchmark = record[4]
            distance = record[5]
        elif row_type == 'Horse':
            number = record[1]
            name = record[2]
            jockey = record[6]
            barrier = record[7]
            weight = record[8]
            results = record[9]
            # results looks like "3-1-1-0 $30000.00":
            # starts-wins-seconds-thirds, then an optional prizemoney part.
            res_split = re.split('[- ]', results)
            starts = res_split[0]
            wins = res_split[1]
            seconds = res_split[2]
            thirds = res_split[3]
            try:
                prizemoney = res_split[4]
            except IndexError:
                # No prizemoney part present in the results field.
                prizemoney = 0
            trainer = record[4]
            location = record[5]
            b_rating = record[15]
            sex = record[16]
            print(name, wins, seconds)
            output_table.append((meeting, date, rail, weather, track, distance,
                                 benchmark, race, number, name, sex, b_rating,
                                 weight, barrier, starts, wins, seconds,
                                 thirds, prizemoney, trainer, location, jockey
                                 ))
    return output_table
# Derive the output path from the input path given on the command line.
MY_FILE = out_file_name(FILENAME)
# newline='' hands line-ending handling to the csv module, as its
# documentation requires: without it the writer can emit blank lines between
# rows on Windows and the reader can mis-handle newlines inside quoted fields.
with open(FILENAME, 'r', newline='') as f_in, \
        open(MY_FILE, 'w', newline='') as f_out:
    CONTENT = csv.reader(f_in)
    FILE_CONTENTS = race_table(CONTENT)
    # Column order matches the tuples returned by race_table.
    headers = ['MEETING', 'DATE', 'RAIL', 'WEATHER', 'TRACK', 'DISTANCE',
               'BENCHMARK', 'RACE', 'NUMBER', 'NAME', 'SEX', 'B_RATING',
               'WEIGHT', 'BARRIER', 'STARTS', 'WINS', 'SECONDS', 'THIRDS',
               'PRIZEMONEY', 'TRAINER', 'LOCATION', 'JOCKEY']
    f_csv = csv.writer(f_out)
    f_csv.writerow(headers)
    f_csv.writerows(FILE_CONTENTS)
if __name__ == '__main__':
    pass
2 个解决方案
#1
1
Remove the dot at the beginning of the expression:
删除表达式开头的点:
>>> s = "<!-- Copyright Notice: © 2010 Racing NSW (and other parties working with it). NSW racing information,including fields, form and results, is subject to copyright which is owned by Racing NSW and other parties working with it. -->"
>>>
>>> re.match(r'^.\<\!', s)
>>> re.match(r'^\<\!', s)
<_sre.SRE_Match object at 0x10da7fed0>
Alternatively, you can filter out the comment lines earlier, when initializing the csv.reader:
或者,您可以在初始化csv.reader时提前过滤掉注释行:
with open(FILENAME, 'r') as f_in, open(MY_FILE, 'w') as f_out:
CONTENT = csv.reader(row for row in f_in if not row.startswith('<!--'))
#2
1
if re.match(r'^<!.*', line):
Remove the `.` at the front and add `.*` at the end. Also, there is no need to escape `<!`.
去掉开头的 `.`,并在末尾加上 `.*`。另外,`<!` 不需要转义。
#1
1
Remove the dot at the beginning of the expression:
删除表达式开头的点:
>>> s = "<!-- Copyright Notice: © 2010 Racing NSW (and other parties working with it). NSW racing information,including fields, form and results, is subject to copyright which is owned by Racing NSW and other parties working with it. -->"
>>>
>>> re.match(r'^.\<\!', s)
>>> re.match(r'^\<\!', s)
<_sre.SRE_Match object at 0x10da7fed0>
Alternatively, you can filter out the comment lines earlier, when initializing the csv.reader:
或者,您可以在初始化csv.reader时提前过滤掉注释行:
with open(FILENAME, 'r') as f_in, open(MY_FILE, 'w') as f_out:
CONTENT = csv.reader(row for row in f_in if not row.startswith('<!--'))
#2
1
if re.match(r'^<!.*', line):
Remove the `.` at the front and add `.*` at the end. Also, there is no need to escape `<!`.
去掉开头的 `.`,并在末尾加上 `.*`。另外,`<!` 不需要转义。