MapReduce的倒排索引

索引：

什么是索引：索引（Index）是帮助数据库高效获取数据的数据结构。索引基于数据库表创建，它包含表中某些列的值以及对应记录的地址，并把这些值存储在一个数据结构中。最常见的做法是使用哈希表或 B+ 树作为索引。

索引的具体分析：https://blog.****.net/meiLin_Ya/article/details/80854232

用代码说事，先来看看我的数据吧：

MapReduce的倒排索引

package com.huhu.day05;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.huhu.day04.ProgenyCount;

公共类InvertedIndex扩展ToolRunner实现工具{

	私人配置conf;

	公共静态类MyMapper扩展Mapper <LongWritable，文本，文本，文本> {

		私人FileSplit拆分;

		private Text va = new Text（）;

		@覆盖

		保护无效设置（Mapper <LongWritable，Text，Text，Text> .Context上下文）

				抛出IOException，InterruptedException {

			split =（FileSplit）context.getInputSplit（）;

		}

		@覆盖

		protected void map（LongWritable key，Text value，Context context）throws IOException，InterruptedException {

			String [] line = value.toString（）。split（“”）;

			通信System.err.println（线）;

			String filename = split.getPath（）。getName（）;

			for（String s：line）{

				va.set（“fileName：”+ filename +“：”+ key.get（）+“\ t索引位置：”+ value.toString（）。indexOf（s）+“\ t”）;

				context.write（new Text（“搜索词：”+ s +“\ r”），new Text（va））;

			}

		}

	}

	公共静态类MyReduce扩展Reducer <文本，文本，文本，文本> {

		@覆盖

		保护无效设置（上下文上下文）抛出IOException，InterruptedException {

		}

		@覆盖

		protected void reduce（Text key，Iterable <Text> values，Context context）

				抛出IOException，InterruptedException {

			StringBuffer sb = new StringBuffer（）;

			for（Text v：values）{

				sb.append（v.toString（））;

			}

			context.write（new Text（key），new Text（sb.toString（）））;

		}

		@覆盖

		保护无效清理（上下文上下文）抛出IOException，InterruptedException {

		}

	}

	公共静态无效的主要（字符串[]参数）抛出异常{

		InvertedIndex t = new InvertedIndex（）;

		配置conf = t.getConf（）;

		String [] other = new GenericOptionsParser（conf，args）.getRemainingArgs（）;

		if（other.length！= 2）{

			System.err.println（“number is fail”）;

		}

		int run = ToolRunner.run（conf，t，args）;

		System.exit（运行）;

	}

	@覆盖

	public Configuration getConf（）{

		if（conf！= null）{

			返回conf;

		}

		返回新的配置（）;

	}

	@覆盖

	public void setConf（Configuration arg0）{

	}

	@覆盖

	公共诠释运行（字符串[]其他）抛出异常{

		配置con = getConf（）;

		Job job = Job.getInstance（con）;

		job.setJarByClass（ProgenyCount.class）;

		job.setMapperClass（MyMapper.class）;

		job.setMapOutputKeyClass（Text.class）;

		job.setMapOutputValueClass（Text.class）;

		//默认分区

		// job.setPartitionerClass（HashPartitioner.class）;

		job.setReducerClass（MyReduce.class）;

		job.setOutputKeyClass（Text.class）;

		job.setOutputValueClass（Text.class）;

		FileInputFormat.addInputPath（job，new Path（“hdfs：// ry-hadoop1：8020 / in / day05 / InvertedIndex”））;

		Path path = new Path（“hdfs：// ry-hadoop1：8020 / out / day05.txt”）;

		FileSystem fs = FileSystem.get（getConf（））;

		if（fs.exists（path））{

			fs.delete（path，true）;

		}

		FileOutputFormat.setOutputPath（job，path）;

		返回job.waitForCompletion（true）？0：1;

	}

}

MapReduce的倒排索引

索引很重要：

详情：https://blog.****.net/meiLin_Ya/article/details/80854232

秒客网

MapReduce的倒排索引

相关文章