mapreduce程序编写(WordCount)

折腾了半天，终于成功编写了自己的第一个mapreduce程序，并通过打jar包的方式运行起来了。

运行环境：

windows 64bit

eclipse 64bit

jdk6.0 64bit

一、工程准备

1、新建java project

2、导入jar包

新建一个user library，把hadoop文件夹里的hadoop-core jar包以及lib目录里的所有jar包都导入进来，以免因缺少依赖而出错。

二、编码

1、主要是计算单词的小程序，测试用

package com.hirra;

import java.io.IOException;

import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

    //嵌套类 Mapper

    //Mapper<keyin,valuein,keyout,valueout>

    public static class WordCountMapper extends Mapper<Object, Text, Text, IntWritable>{

        private final static IntWritable one = new IntWritable(1);

        private Text word = new Text();  

        @Override

        protected void map(Object key, Text value, Context context)

                throws IOException, InterruptedException {

            StringTokenizer itr = new StringTokenizer(value.toString());

            while(itr.hasMoreTokens()){

                word.set(itr.nextToken());

                context.write(word, one);//Context机制

            }

        }

    }  

    //嵌套类Reducer

    //Reduce<keyin,valuein,keyout,valueout>

    //Reducer的valuein类型要和Mapper的va lueout类型一致,Reducer的valuein是Mapper的valueout经过shuffle之后的值

    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{

        private IntWritable result = new IntWritable();  

        @Override

        protected void reduce(Text key, Iterable<IntWritable> values,

                Context context)

                throws IOException, InterruptedException {

            int sum  = 0;

            for(IntWritable i:values){

                sum += i.get();

            }

            result.set(sum);

            context.write(key,result);//Context机制

        }  

    }  

    public static void main(String[] args) throws Exception{

        Configuration conf = new Configuration();//获得Configuration配置 Configuration: core-default.xml, core-site.xml　
　　　　　//很关键
　　　　 conf.set("mapred.job.tracker", "hadoopmaster:9001");

　　　　String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();//获得输入参数[hdfs://localhost:9000/user/dat/input, hdfs://localhost:9000/user/dat/output]

        if(otherArgs.length != 2){//判断输入参数个数，不为两个异常退出

            System.err.println("Usage:wordcount <in> <out>");

            System.exit(2);

        }  

        ////设置Job属性

        Job job = new Job(conf,"word count");

        job.setJarByClass(WordCount.class);

        job.setMapperClass(WordCountMapper.class);

        job.setCombinerClass(WordCountReducer.class);//将结果进行局部合并

        job.setReducerClass(WordCountReducer.class);

        job.setOutputKeyClass(Text.class);

        job.setOutputValueClass(IntWritable.class);  

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));//传入input path

        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));//传入output path，输出路径应该为空，否则报错org.apache.hadoop.mapred.FileAlreadyExistsException。  

        System.exit(job.waitForCompletion(true)?0:1);//是否正常退出

    }

}

2、注意问题

如果有依赖的jar包没有导入，编译或运行时会报错，此时请检查前面创建的user library里的jar包是否齐全。

三、生成jar包

1、eclipse自带功能export jar包

四、运行

1、使用ssh client工具把生成的jar包上传至linux服务器

2、用hadoop运行：转到hadoop的bin目录下，执行下面的指令（如果导出jar包时没有指定Main-Class，需要在jar包名后面加上主类名com.hirra.WordCount）:

./hadoop jar test.jar /README.txt /usr/dat/output

3、注意问题

output目录必须是之前不存在的路径，否则会报org.apache.hadoop.mapred.FileAlreadyExistsException。

秒客网

mapreduce程序编写(WordCount)

一、工程准备

二、编码

相关文章