【hadoop】python通过hdfs模块读hdfs数据

hdfs 库（HdfsCLI）官方 API 文档：http://hdfscli.readthedocs.io/en/latest/api.html

一个非常好的博客：http://blog.csdn.net/gamer_gyt/article/details/52446757

hdfs 库自带 Avro 序列化与反序列化的扩展模块（hdfs.ext.avro），读写 Avro 文件时无需自己另外实现序列化逻辑

#!/usr/bin/env python

# encoding: utf-8

"""Avro extension example."""

from hdfs import Config

from hdfs.ext.avro import AvroReader, AvroWriter

# Build a client from the default alias in the HdfsCLI configuration.
client = Config().get_client()

# Two small sample records used for the round trip below.
records = [
    {'name': 'Ann', 'age': 23},
    {'name': 'Bob', 'age': 22},
]

# Upload the records as an Avro file on HDFS. No schema is passed: the
# records are simple enough for the writer to infer it; a more complex
# layout would require passing the schema explicitly.
with AvroWriter(client, 'names.avro', overwrite=True) as writer:
    for record in records:
        writer.write(record)

# Download the same file again and check the round trip.
with AvroReader(client, 'names.avro') as reader:
    # Schema that the writer inferred for the file.
    schema = reader.schema
    # HDFS content object describing the remote file.
    content = reader.content
    # Iterating the reader yields the decoded records, which must equal
    # what was written.
    assert records == list(reader)

遍历hdfs目录，并逐个读取其中的avro文件

from hdfs import *

import os

from hdfs.ext.avro import AvroReader, AvroWriter

def main(url="http://127.0.0.1:50070", path="/test/tmp_data"):
    """Walk an HDFS directory tree and print every record of each file.

    Each file found under ``path`` is opened with ``AvroReader``, so the
    tree is expected to contain Avro files only.

    Args:
        url: WebHDFS endpoint handed to ``Client``. The default is the
            address this script originally hard-coded.
        path: HDFS directory to traverse. The default is the directory
            this script originally hard-coded.
    """
    # HDFS paths always use '/' whatever the local OS is, so join them
    # with posixpath instead of os.path (backslashes on Windows).
    import posixpath

    client = Client(url)
    # walk() yields (root, dirs, files); the directory list is unused here.
    for root, _dirs, filenames in client.walk(path):
        for filename in filenames:
            full_path = posixpath.join(root, filename)
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(full_path)
            # Inside the context, reader.schema (the inferred schema) and
            # reader.content (the remote file's HDFS content object) are
            # also available if needed.
            with AvroReader(client, full_path) as reader:
                # Iterate the reader directly rather than building a
                # throwaway list of every record in the file.
                for user in reader:
                    print(user)


if __name__ == "__main__":
    main()

秒客网

【hadoop】python通过hdfs模块读hdfs数据

相关文章