1. Upload the wordcount.txt text file to the /data/ directory. The contents of wordcount.txt are as follows:
red black green yellow
red blue blue
black big small small yellow
red red red red
blue
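One quick way to create the file on CentOS 7 (a minimal sketch; any editor works just as well):
cat > /data/wordcount.txt << 'EOF'
red black green yellow
red blue blue
black big small small yellow
red red red red
blue
EOF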
2. Create a Java Maven project and add the HDFS and MapReduce dependencies to pom.xml, as follows:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.che</groupId>
  <artifactId>demo</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>demo</name>
  <url>http://maven.apache.org</url>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.7.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.7.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.7.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.7.0</version>
    </dependency>
  </dependencies>
</project>
3. The code is as follows:
3.1 WordCount Mapper implementation class WordCountMapper.java
package com.che.demo.mapreduce;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * WordCount Mapper implementation class
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Convert the Text value to a String
        String datas = value.toString();
        // Split the line into individual words on " "
        String[] words = datas.split(" ");
        for (String word : words) {
            // Emit a (word, 1) pair for every word in the line
            context.write(new Text(word), new LongWritable(1));
        }
    }
}
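For example, the input line "red blue blue" causes the mapper to emit the pairs (red, 1), (blue, 1), (blue, 1).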
3.2 WordCount Reducer implementation class WordCountReducer.java
package com.che.demo.mapreduce;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * WordCount Reducer implementation class.
 * The value type must be LongWritable to match the (word, 1) pairs
 * emitted by WordCountMapper.
 */
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text word, Iterable<LongWritable> valuesIterator, Context context)
            throws IOException, InterruptedException {
        // Sum the counts for this word
        long count = 0;
        for (LongWritable value : valuesIterator) {
            count += value.get();
        }
        context.write(word, new LongWritable(count));
    }
}
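For the sample file, the reduce call for the key "red" receives the values (1, 1, 1, 1, 1, 1) and writes (red, 6).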
3.3 WordCount main method implementation class WordCountJob.java
package com.che.demo.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * WordCount main method implementation class
 */
public class WordCountJob {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job wcjob = Job.getInstance(conf);
        wcjob.setJarByClass(WordCountJob.class);
        wcjob.setMapperClass(WordCountMapper.class);
        wcjob.setReducerClass(WordCountReducer.class);
        // Mapper output key/value types
        wcjob.setMapOutputKeyClass(Text.class);
        wcjob.setMapOutputValueClass(LongWritable.class);
        // Job (final) output key/value types
        wcjob.setOutputKeyClass(Text.class);
        wcjob.setOutputValueClass(LongWritable.class);
        // Specify where the input data to process is located
        FileInputFormat.setInputPaths(wcjob, "/user/che/1021001/input");
        // Specify where the results are saved after processing
        FileOutputFormat.setOutputPath(wcjob, new Path("/user/che/1021001/output"));
        // Submit this job to the YARN cluster and wait for it to finish
        boolean res = wcjob.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
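Optionally (this line is not part of the original setup), a combiner can be registered to pre-aggregate counts on the map side and cut shuffle traffic; WordCountReducer works as the combiner here because its input and output types match:

wcjob.setCombinerClass(WordCountReducer.class);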
3.4 Package the project into a jar in Eclipse; the project's target directory will then contain the new jar file.
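Alternatively, the jar can be built from the project root on the command line, assuming Maven is installed:
mvn clean package
This produces target/demo-0.0.1-SNAPSHOT.jar.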
3.5 Rename demo-0.0.1-SNAPSHOT.jar to demo.jar and upload it to the /data/ directory on CentOS 7.
3.6 Upload the /data/wordcount.txt file from CentOS 7 to the /user/che/1021001/input directory on HDFS.
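If the input directory does not exist on HDFS yet, create it first:
hdfs dfs -mkdir -p /user/che/1021001/input
Then upload the file: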
hdfs dfs -put /data/wordcount.txt /user/che/1021001/input
3.7 Run the job with the hadoop jar command:
hadoop jar /data/demo.jar com.che.demo.mapreduce.WordCountJob
3.8 View the output; the results are in /user/che/1021001/output.
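A minimal way to inspect them (part-r-00000 is the default output file name for a single-reducer job):
hdfs dfs -cat /user/che/1021001/output/part-r-00000
For the sample wordcount.txt above, the output should be:
big	1
black	2
blue	3
green	1
red	6
small	2
yellow	2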