关于hadoop:Hadoop-入门笔记-二十三-MapReduce性能优化数据压缩优化

一. 压缩优化设计

运行MapReduce程序时，磁盘I/O操作、网络数据传输、shuffle和merge要花大量的时间，尤其是在数据规模很大和工作负载密集的情况下。鉴于磁盘I/O和网络带宽是Hadoop的宝贵资源，数据压缩对于节省资源、最小化磁盘I/O和网络传输非常有帮助。如果磁盘I/O和网络带宽影响了MapReduce作业性能，在任意MapReduce阶段启用压缩都可以改善端到端处理时间并减少I/O和网络流量。

压缩是mapreduce的一种优化策略：通过压缩编码对mapper或者reducer的输出进行压缩，
以减少磁盘IO，提高MR程序运行速度，它的优缺点如下：
压缩的优点：

减少文件存储所占空间
加快文件传输效率，从而提高系统的处理速度
降低IO读写的次数

压缩的缺点：

使用数据时需要先对文件解压，加重CPU负荷；压缩算法越复杂，解压时间越长

二. 压缩支持

1. 查看Hadoop支持的压缩算法：hadoop checknative

2. Hadoop支持的压缩算法

3. 各压缩算法压缩性能对比

压缩算法	优点	缺点
Gzip	压缩比在四种压缩方式中较高；hadoop本身支持，在应用中处理gzip格式的文件就和直接处理文本一样；有hadoop native库；大部分linux系统都自带gzip命令，使用方便	不支持split
Lzo	压缩/解压速度也比较快，合理的压缩率；支持split，是hadoop中最流行的压缩格式；支持hadoop native库；需要在linux系统下自行安装lzop命令，使用方便	压缩率比gzip要低；hadoop本身不支持，需要安装；lzo虽然支持split，但需要对lzo文件建索引，否则hadoop也是会把lzo文件看成一个普通文件（为了支持split需要建索引，需要指定inputformat为lzo格式）
Bzip2	支持split；具有很高的压缩率，比gzip压缩率都高；hadoop本身支持，但不支持native；在linux系统下自带bzip2命令，使用方便	压缩/解压速度慢；不支持native
Snappy	压缩速度快；支持hadoop native库	不支持split；压缩比低；hadoop本身不支持，需要安装；linux系统下没有对应的命令

4. 同样大小的数据对应压缩比

5. 压缩时间和解压时间

从以上对比可以看出：压缩比越高，压缩时间越长，应当选择压缩比与压缩时间中等的压缩算法

三.Gzip压缩

1. 生成Gzip压缩文件

1. 需求：读取普通文本文件，将普通文本文件压缩为Gzip格式

2. 思路

Input读取普通文本文件
Map和Reduce直接输出
配置Output输出压缩为Gzip格式

3.代码实现

import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;import java.io.IOException;/** * @ClassName MRWriteGzip * @Description TODO 读取一般文件数据，对数据以Gzip格局进行压缩 */public class MRWriteGzip extends Configured implements Tool {    //构建、配置、提交一个 MapReduce的Job    public int run(String[] args) throws Exception {        //构建Job        Job job = Job.getInstance(this.getConf(),this.getClass().getSimpleName());        job.setJarByClass(MRWriteGzip.class);        //input：配置输出        Path inputPath = new Path(args[0]);        TextInputFormat.setInputPaths(job,inputPath);        //map：配置Map        job.setMapperClass(MrMapper.class);        job.setMapOutputKeyClass(NullWritable.class);        job.setMapOutputValueClass(Text.class);        //reduce：配置Reduce        job.setReducerClass(MrReduce.class);        job.setOutputKeyClass(NullWritable.class);        job.setOutputValueClass(Text.class);   //output：配置输入        Path outputPath = new Path(args[1]);        TextOutputFormat.setOutputPath(job,outputPath);        return job.waitForCompletion(true) ? 
0 : -1;    }    //程序入口，调用run    public static void main(String[] args) throws Exception {        //用于治理以后程序的所有配置        Configuration conf = new Configuration();        //配置输入后果压缩为Gzip格局        conf.set("mapreduce.output.fileoutputformat.compress","true");        conf.set("mapreduce.output.fileoutputformat.compress.codec","org.apache.hadoop.io.compress.GzipCodec");        //调用run办法，提交运行Job        int status = ToolRunner.run(conf, new MRWriteGzip(), args);        System.exit(status);    }    /**     * 定义Mapper类     */    public static class MrMapper extends Mapper<LongWritable, Text, NullWritable, Text>{        private NullWritable outputKey = NullWritable.get();        @Override        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {            //间接输入每条数据            context.write(this.outputKey,value);        }    }    /**     * 定义Reduce类     */    public static class MrReduce extends Reducer<NullWritable,Text,NullWritable, Text> {        @Override        protected void reduce(NullWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {            //间接输入每条数据            for (Text value : values) {                context.write(key, value);            }        }    }}

2. 读取Gzip压缩文件

1. 需求：读取Gzip压缩文件，还原为普通文本文件

2. 思路

Input直接读取上一步的压缩结果文件
Map和Reduce直接输出

Output将结果保存为普通文本文件

3.代码开发

import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;import java.io.IOException;/** * @ClassName MRReadGzip * @Description TODO 读取Gzip格局的数据，还原为一般文本文件 */public class MRReadGzip extends Configured implements Tool { //构建、配置、提交一个 MapReduce的Job public int run(String[] args) throws Exception {     //构建Job     Job job = Job.getInstance(this.getConf(),this.getClass().getSimpleName());     job.setJarByClass(MRReadGzip.class);     //input：配置输出     Path inputPath = new Path(args[0]);     TextInputFormat.setInputPaths(job,inputPath);     //map：配置Map     job.setMapperClass(MrMapper.class);     job.setMapOutputKeyClass(NullWritable.class);     job.setMapOutputValueClass(Text.class);     //reduce：配置Reduce     job.setReducerClass(MrReduce.class);     job.setOutputKeyClass(NullWritable.class);     job.setOutputValueClass(Text.class);     //output：配置输入     Path outputPath = new Path(args[1]);     TextOutputFormat.setOutputPath(job,outputPath);     return job.waitForCompletion(true) ? 
0 : -1; } //程序入口，调用run public static void main(String[] args) throws Exception {     //用于治理以后程序的所有配置     Configuration conf = new Configuration();     //配置输入后果压缩为Gzip格局//        conf.set("mapreduce.output.fileoutputformat.compress","true");//        conf.set("mapreduce.output.fileoutputformat.compress.codec","org.apache.hadoop.io.compress.GzipCodec"); //调用run办法，提交运行Job     int status = ToolRunner.run(conf, new MRReadGzip(), args);     System.exit(status); } /**  * 定义Mapper类  */ public static class MrMapper extends Mapper<LongWritable, Text, NullWritable, Text>{     private NullWritable outputKey = NullWritable.get();     @Override     protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {         //间接输入每条数据         context.write(this.outputKey,value);     } } /**  * 定义Reduce类  */ public static class MrReduce extends Reducer<NullWritable, Text,NullWritable, Text> {     @Override     protected void reduce(NullWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {         //间接输入每条数据         for (Text value : values) {             context.write(key, value);         }     } }}

四. Snappy压缩

1. 配置Hadoop支持Snappy

Hadoop支持Snappy类型的压缩算法，并且也是最常用的一种压缩算法，但是Hadoop官方已编译的安装包中并没有提供Snappy的支持，所以如果想使用Snappy压缩，必须下载Hadoop源码，自己进行编译，在编译时添加Snappy的支持，具体编译过程请参考《Hadoop3编译安装》手册。

2. 生成Snappy压缩文件：Map输出不压缩

1. 需求：读取普通文本文件，转换为Snappy压缩文件

2. 思路

Input读取普通文本文件
Map和Reduce直接输出

Output配置输出压缩为Snappy类型

3. 代码开发

import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;import java.io.IOException;/** * @ClassName MRWriteSnappy * @Description TODO 读取一般文件数据，对数据以Snappy格局进行压缩 */public class MRWriteSnappy extends Configured implements Tool { //构建、配置、提交一个 MapReduce的Job public int run(String[] args) throws Exception {     //构建Job     Job job = Job.getInstance(this.getConf(),this.getClass().getSimpleName());     job.setJarByClass(MRWriteSnappy.class);     //input：配置输出     Path inputPath = new Path(args[0]);     TextInputFormat.setInputPaths(job,inputPath);   //map：配置Map     job.setMapperClass(MrMapper.class);     job.setMapOutputKeyClass(NullWritable.class);     job.setMapOutputValueClass(Text.class);     //reduce：配置Reduce     job.setReducerClass(MrReduce.class);     job.setOutputKeyClass(NullWritable.class);     job.setOutputValueClass(Text.class);     //output：配置输入     Path outputPath = new Path(args[1]);     TextOutputFormat.setOutputPath(job,outputPath);     return job.waitForCompletion(true) ? 
0 : -1; } //程序入口，调用run public static void main(String[] args) throws Exception {     //用于治理以后程序的所有配置     Configuration conf = new Configuration();     //配置输入后果压缩为Snappy格局     conf.set("mapreduce.output.fileoutputformat.compress","true");     conf.set("mapreduce.output.fileoutputformat.compress.codec","org.apache.hadoop.io.compress.SnappyCodec");     //调用run办法，提交运行Job     int status = ToolRunner.run(conf, new MRWriteSnappy(), args);     System.exit(status); } /**  * 定义Mapper类  */ public static class MrMapper extends Mapper<LongWritable, Text, NullWritable, Text>{     private NullWritable outputKey = NullWritable.get(); @Override     protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {         //间接输入每条数据         context.write(this.outputKey,value);     } } /**  * 定义Reduce类  */ public static class MrReduce extends Reducer<NullWritable, Text,NullWritable, Text> {     @Override     protected void reduce(NullWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {         //间接输入每条数据         for (Text value : values) {             context.write(key, value);         }     } }}

3. 生成Snappy压缩文件：Map输出压缩

1. 需求：读取普通文本文件，转换为Snappy压缩文件，并对Map输出的结果使用Snappy压缩

2. 思路：在上一步的代码中添加Map输出压缩的配置

3. 代码开发

import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;import java.io.IOException;/** * @ClassName MRMapOutputSnappy * @Description TODO 读取一般文件数据，对Map输入的数据以Snappy格局进行压缩 */public class MRMapOutputSnappy extends Configured implements Tool {    //构建、配置、提交一个 MapReduce的Job    public int run(String[] args) throws Exception {        //构建Job        Job job = Job.getInstance(this.getConf(),this.getClass().getSimpleName());        job.setJarByClass(MRMapOutputSnappy.class);        //input：配置输出        Path inputPath = new Path(args[0]);        TextInputFormat.setInputPaths(job,inputPath);        //map：配置Map        job.setMapperClass(MrMapper.class);        job.setMapOutputKeyClass(NullWritable.class);        job.setMapOutputValueClass(Text.class);        //reduce：配置Reduce        job.setReducerClass(MrReduce.class);        job.setOutputKeyClass(NullWritable.class);        job.setOutputValueClass(Text.class);        //output：配置输入        Path outputPath = new Path(args[1]);        TextOutputFormat.setOutputPath(job,outputPath);  return job.waitForCompletion(true) ? 
0 : -1;    }    //程序入口，调用run    public static void main(String[] args) throws Exception {        //用于治理以后程序的所有配置        Configuration conf = new Configuration();        //配置Map输入后果压缩为Snappy格局        conf.set("mapreduce.map.output.compress","true");        conf.set("mapreduce.map.output.compress.codec","org.apache.hadoop.io.compress.SnappyCodec");        //配置Reduce输入后果压缩为Snappy格局        conf.set("mapreduce.output.fileoutputformat.compress","true");        conf.set("mapreduce.output.fileoutputformat.compress.codec","org.apache.hadoop.io.compress.SnappyCodec");        //调用run办法，提交运行Job        int status = ToolRunner.run(conf, new MRMapOutputSnappy(), args);        System.exit(status);    }    /**     * 定义Mapper类     */    public static class MrMapper extends Mapper<LongWritable, Text, NullWritable, Text>{        private NullWritable outputKey = NullWritable.get();        @Override        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {            //间接输入每条数据            context.write(this.outputKey,value);        }    }    /**     * 定义Reduce类     */   public static class MrReduce extends Reducer<NullWritable, Text,NullWritable, Text> {        @Override        protected void reduce(NullWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {            //间接输入每条数据            for (Text value : values) {                context.write(key, value);            }        }    }}

4. 读取Snappy压缩文件

1. 需求：读取上一步生成的Snappy文件，还原为普通文本文件

2. 思路：

Input读取Snappy文件
Map和Reduce直接输出

Output直接输出为普通文本类型

3. 代码:

import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;import java.io.IOException;/** * @ClassName MRReadSnappy * @Description TODO 读取Snappy格局的数据，还原为一般文本文件 */public class MRReadSnappy extends Configured implements Tool { //构建、配置、提交一个 MapReduce的Job public int run(String[] args) throws Exception {     //构建Job     Job job = Job.getInstance(this.getConf(),this.getClass().getSimpleName());     job.setJarByClass(MRReadSnappy.class);     //input：配置输出     Path inputPath = new Path(args[0]);     TextInputFormat.setInputPaths(job,inputPath);     //map：配置Map     job.setMapperClass(MrMapper.class);     job.setMapOutputKeyClass(NullWritable.class);     job.setMapOutputValueClass(Text.class);     //reduce：配置Reduce     job.setReducerClass(MrReduce.class);     job.setOutputKeyClass(NullWritable.class);     job.setOutputValueClass(Text.class);     //output：配置输入     Path outputPath = new Path(args[1]);     TextOutputFormat.setOutputPath(job,outputPath);     return job.waitForCompletion(true) ? 
0 : -1; } //程序入口，调用run public static void main(String[] args) throws Exception {     //用于治理以后程序的所有配置     Configuration conf = new Configuration();     //调用run办法，提交运行Job     int status = ToolRunner.run(conf, new MRReadSnappy(), args); System.exit(status); } /**  * 定义Mapper类  */ public static class MrMapper extends Mapper<LongWritable, Text, NullWritable, Text>{     private NullWritable outputKey = NullWritable.get();     @Override     protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {         //间接输入每条数据         context.write(this.outputKey,value);     } } /**  * 定义Reduce类  */ public static class MrReduce extends Reducer<NullWritable, Text,NullWritable, Text> {     @Override     protected void reduce(NullWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {         //间接输入每条数据         for (Text value : values) {             context.write(key, value);         }     } }}

五. Lzo压缩

1. 配置Hadoop支持Lzo

Hadoop本身不支持Lzo类型的压缩，需要额外单独安装，并在编译时添加Lzo的压缩算法支持，编译过程请参考编译手册《Apache Hadoop3-1-3编译安装部署lzo压缩指南》。
编译完成后，请完成以下配置，让当前的Hadoop支持Lzo压缩

添加lzo支持jar包

# Copy the hadoop-lzo jar into Hadoop's share/hadoop/common directory.
cp hadoop-lzo-0.4.21-SNAPSHOT.jar /export/server/hadoop-3.1.4/share/hadoop/common/

同步到所有节点

# Distribute the hadoop-lzo jar to the same directory on the other nodes.
cd /export/server/hadoop-3.1.4/share/hadoop/common/
scp hadoop-lzo-0.4.21-SNAPSHOT.jar node2:$PWD
scp hadoop-lzo-0.4.21-SNAPSHOT.jar node3:$PWD

修改core-site.xml

<!-- Compression codec classes registered with Hadoop, including the two LZO codecs. -->
<property>
    <name>io.compression.codecs</name>
    <value>org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec,com.hadoop.compression.lzo.LzoCodec,com.hadoop.compression.lzo.LzopCodec</value>
</property>
<!-- Class that implements the LZO codec. -->
<property>
    <name>io.compression.codec.lzo.class</name>
    <value>com.hadoop.compression.lzo.LzoCodec</value>
</property>

同步core-site.xml到其他所有节点

# Distribute the updated core-site.xml to the same directory on the other nodes.
cd /export/server/hadoop-3.1.4/etc/hadoop
scp core-site.xml node2:$PWD
scp core-site.xml node3:$PWD

重新启动Hadoop集群

2.生成Lzo压缩文件

1. 需求：读取普通文本文件，生成Lzo压缩结果文件

2. 思路

读取普通文本文件
Map和Reduce直接输出

配置Output输出压缩为Lzo类型

3. 代码开发

import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;import java.io.IOException;/** * @ClassName MRWriteLzo * @Description TODO 读取一般文件数据，对数据以Lzo格局进行压缩 */public class MRWriteLzo extends Configured implements Tool { //构建、配置、提交一个 MapReduce的Job public int run(String[] args) throws Exception {     //构建Job     Job job = Job.getInstance(this.getConf(),this.getClass().getSimpleName());     job.setJarByClass(MRWriteLzo.class);     //input：配置输出     Path inputPath = new Path(args[0]);     TextInputFormat.setInputPaths(job,inputPath);  //map：配置Map     job.setMapperClass(MrMapper.class);     job.setMapOutputKeyClass(NullWritable.class);     job.setMapOutputValueClass(Text.class);     //reduce：配置Reduce     job.setReducerClass(MrReduce.class);     job.setOutputKeyClass(NullWritable.class);     job.setOutputValueClass(Text.class);     //output：配置输入     Path outputPath = new Path(args[1]);     TextOutputFormat.setOutputPath(job,outputPath);     return job.waitForCompletion(true) ? 
0 : -1; } //程序入口，调用run public static void main(String[] args) throws Exception {     //用于治理以后程序的所有配置     Configuration conf = new Configuration();     //配置输入后果压缩为Lzo格局     conf.set("mapreduce.output.fileoutputformat.compress","true");     conf.set("mapreduce.output.fileoutputformat.compress.codec","com.hadoop.compression.lzo.LzopCodec");     //调用run办法，提交运行Job     int status = ToolRunner.run(conf, new MRWriteLzo(), args);     System.exit(status); } /**  * 定义Mapper类  */ public static class MrMapper extends Mapper<LongWritable, Text, NullWritable, Text>{     private NullWritable outputKey = NullWritable.get();     @Override     protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {         //间接输入每条数据         context.write(this.outputKey,value);     } } /**  * 定义Reduce类  */ public static class MrReduce extends Reducer<NullWritable, Text,NullWritable, Text> {     @Override     protected void reduce(NullWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {         //间接输入每条数据         for (Text value : values) {             context.write(key, value);         }     } }}

3. 读取Lzo压缩文件

1. 需求：读取Lzo压缩文件，恢复为普通文本文件

2. 代码开发

import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;import java.io.IOException;/** * @ClassName MRReadLzo * @Description TODO 读取Lzo格局的数据，还原为一般文本文件 */public class MRReadLzo extends Configured implements Tool { //构建、配置、提交一个 MapReduce的Job public int run(String[] args) throws Exception {     //构建Job     Job job = Job.getInstance(this.getConf(),this.getClass().getSimpleName());     job.setJarByClass(MRReadLzo.class);     //input：配置输出     Path inputPath = new Path(args[0]);     TextInputFormat.setInputPaths(job,inputPath);     //map：配置Map     job.setMapperClass(MrMapper.class);     job.setMapOutputKeyClass(NullWritable.class);     job.setMapOutputValueClass(Text.class);     //reduce：配置Reduce     job.setReducerClass(MrReduce.class);     job.setOutputKeyClass(NullWritable.class);     job.setOutputValueClass(Text.class);     //output：配置输入     Path outputPath = new Path(args[1]);     TextOutputFormat.setOutputPath(job,outputPath);     return job.waitForCompletion(true) ? 
0 : -1; }  //程序入口，调用run public static void main(String[] args) throws Exception {     //用于治理以后程序的所有配置     Configuration conf = new Configuration();     //配置输入后果压缩为Gzip格局//        conf.set("mapreduce.output.fileoutputformat.compress","true");//        conf.set("mapreduce.output.fileoutputformat.compress.codec","org.apache.hadoop.io.compress.GzipCodec");     //调用run办法，提交运行Job     int status = ToolRunner.run(conf, new MRReadLzo(), args);     System.exit(status); } /**  * 定义Mapper类  */ public static class MrMapper extends Mapper<LongWritable, Text, NullWritable, Text>{     private NullWritable outputKey = NullWritable.get();     @Override     protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {         //间接输入每条数据         context.write(this.outputKey,value);     } } /**  * 定义Reduce类  */ public static class MrReduce extends Reducer<NullWritable, Text,NullWritable, Text> {     @Override     protected void reduce(NullWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {         //间接输入每条数据  for (Text value : values) {             context.write(key, value);         }     } }}