从 HDFS 中获取所需的数据:
import subprocess

# Layout of one "^"-delimited record as consumed by format_record().
FIELD_SEP = b"^"
MIN_LINE_LENGTH = 100   # lines with this many bytes or fewer are ignored
FILTER_FIELD = 6        # index of the field that must equal FILTER_VALUE
FILTER_VALUE = b"6"
LEADING_FIELDS = 32     # number of leading fields copied to the output row
EXTRA_FIELD = 95        # index of the one extra field appended to the row


def format_record(line):
    """Return the output row for one raw input line, or None if it is skipped.

    The line is handled as bytes so that no text encoding has to be assumed
    for the data coming out of the pipe.

    Args:
        line: one raw line (bytes), including its trailing newline if any.

    Returns:
        The first LEADING_FIELDS fields and field EXTRA_FIELD joined by single
        spaces and terminated by a newline, or None when the line is not
        longer than MIN_LINE_LENGTH or field FILTER_FIELD is not FILTER_VALUE.

    Raises:
        IndexError: the line is long enough to be considered but has too few
            fields for FILTER_FIELD or EXTRA_FIELD to exist.
    """
    if len(line) <= MIN_LINE_LENGTH:
        return None
    fields = line.split(FIELD_SEP)
    if fields[FILTER_FIELD] != FILTER_VALUE:
        return None
    # NOTE(review): the line's own newline is not stripped, so if EXTRA_FIELD
    # happens to be the last field the row is followed by an empty line.
    # Kept as in the original script; confirm whether that is intended.
    return b" ".join(fields[:LEADING_FIELDS]) + b" " + fields[EXTRA_FIELD] + b"\n"


# Dump one filtered text file per (day, hour) directory of the October 2018
# metadata tree.
for day in range(22, 23):
    for h in range(17, 24):
        filename = "metadata-2018-10-%02d-%02d.txt" % (day, h)
        cmd = "hdfs dfs -text /flume/metadata/2018/10/%02d/%02d/*.snappy" % (day, h)
        print(cmd)
        # The command is built only from the integer loop counters above, so
        # running it through the shell does not expose it to external input.
        # The context managers close the pipe and the output file and wait
        # for the child process even if an error escapes the loop.
        with subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) as proc:
            with open(filename, "wb") as out:
                for line in proc.stdout:
                    try:
                        row = format_record(line)
                    except IndexError as e:
                        # Malformed record: report it and keep going.
                        print(e, "fuck error", line)
                        continue
                    if row is not None:
                        out.write(row)
        if proc.returncode != 0:
            # Without this a failed command would just leave an empty file.
            print("command exited with status %d: %s" % (proc.returncode, cmd))