Scenario:
For work reasons I'm currently catching up on these technologies, starting with a small demo — without the basics, a larger project would be out of reach anyway.
The plan: first generate mock data, import it into HDFS, then land it in Hive for practice. The tools expected along the way include Hive, HDFS, Flume, Kafka, ELK, etc.
Component environment: CDH
1. Mock data generation — the goal is to produce a sizable volume of data that can later be used for cleaning and transfer exercises.
Create a Maven project named Table_emp_dept.
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>Table_emp_dept</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <!-- Used to randomly generate user data (Chinese names, addresses) -->
        <dependency>
            <groupId>com.github.binarywang</groupId>
            <artifactId>java-testdata-generator</artifactId>
            <version>1.1.2</version>
        </dependency>
        <!-- JUnit for test classes -->
        <!-- https://mvnrepository.com/artifact/junit/junit -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <!-- Commons CSV for writing the output files -->
        <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-csv</artifactId>
            <version>1.7</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <!-- Entry point of this project; the run command below also names the class explicitly via -cp -->
                            <mainClass>product.data.ProductData</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
Utility class ProductDataUtil.java
package product.data;

import cn.binarywang.tools.generator.ChineseAddressGenerator;
import cn.binarywang.tools.generator.ChineseNameGenerator;

import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.TreeMap;

public class ProductDataUtil {

    // Random age in the range [20, 44]
    public static int productAge() {
        Random random = new Random();
        return 20 + random.nextInt(25);
    }

    // Salary derived from the job title
    public static int productSal(String job) {
        Random random = new Random();
        int salary;
        if (job.contains("总监")) {
            salary = 30000 + random.nextInt(3) * 10000 * 2;
        } else if (job.contains("经理")) {
            salary = 22000 + random.nextInt(2) * 10000;
        } else if (job.contains("工程师")) {
            salary = 13000 + random.nextInt(2) * 10000;
        } else if (job.contains("专员")) {
            salary = 15000;
        } else {
            salary = 8000;
        }
        return salary;
    }

    // Random hire date between 2014 and 2019; the day is drawn from 1-31 regardless of month
    public static String productHiredate() {
        Random random = new Random();
        int year = 2014 + random.nextInt(6);
        int month = 1 + random.nextInt(12);
        int day = 1 + random.nextInt(31);
        return year + "-" + month + "-" + day;
    }

    // Pick a random job title belonging to the given department number
    public static String productJob(String depNo) {
        TreeMap<String, List<String>> source = new TreeMap<String, List<String>>();
        List<String> RSA001 = new ArrayList<String>();
        RSA001.add("产品总监");
        RSA001.add("产品经理");
        RSA001.add("产品专员");
        RSA001.add("产品助理");
        List<String> RSA002 = new ArrayList<String>();
        RSA002.add("设计总监");
        RSA002.add("设计经理");
        RSA002.add("设计专员");
        RSA002.add("设计助理");
        List<String> RSA003 = new ArrayList<String>();
        RSA003.add("技术总监");
        RSA003.add("技术经理");
        RSA003.add("算法工程师");
        RSA003.add("大数据工程师");
        RSA003.add("前端工程师");
        RSA003.add("后端工程师");
        RSA003.add("运维工程师");
        List<String> RSA004 = new ArrayList<String>();
        RSA004.add("运营总监");
        RSA004.add("运营经理");
        RSA004.add("运营专员");
        RSA004.add("运营助理");
        List<String> RSA005 = new ArrayList<String>();
        RSA005.add("财务总监");
        RSA005.add("财务经理");
        RSA005.add("预算主管");
        RSA005.add("出纳员");
        RSA005.add("审计员");
        source.put("RSA001_20190121", RSA001);
        source.put("RSA002_20180623", RSA002);
        source.put("RSA003_20150422", RSA003);
        source.put("RSA004_20160903", RSA004);
        source.put("RSA005_20120608", RSA005);
        List<String> jobs = source.get(depNo);
        int index = new Random().nextInt(jobs.size());
        return jobs.get(index);
    }

    // Pick a random department number
    public static String productDeptNo() {
        List<String> source = new ArrayList<String>();
        source.add("RSA001_20190121");
        source.add("RSA002_20180623");
        source.add("RSA003_20150422");
        source.add("RSA004_20160903");
        source.add("RSA005_20120608");
        int index = new Random().nextInt(source.size());
        return source.get(index);
    }

    // Map a department number to its department name
    public static String productDepName(String deptNo) {
        String deptName = "";
        switch (deptNo) {
            case "RSA001_20190121":
                deptName = "产品部";
                break;
            case "RSA002_20180623":
                deptName = "设计部";
                break;
            case "RSA003_20150422":
                deptName = "技术部";
                break;
            case "RSA004_20160903":
                deptName = "运营部";
                break;
            case "RSA005_20120608":
                deptName = "财务部";
                break;
        }
        return deptName;
    }

    // Random Chinese address from java-testdata-generator
    public static String productAddress() {
        return ChineseAddressGenerator.getInstance().generate();
    }

    // Random Chinese name from java-testdata-generator
    public static String productName() {
        return ChineseNameGenerator.getInstance().generate();
    }
}
Data generator ProductData.java
package product.data;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;

public class ProductData {

    // Build one tab-separated employee record: ename, age, job, hiredate, salary, deptno
    public static String productEmp() {
        String ename = ProductDataUtil.productName();
        int age = ProductDataUtil.productAge();
        String deptno = ProductDataUtil.productDeptNo();
        String job = ProductDataUtil.productJob(deptno);
        String hiredate = ProductDataUtil.productHiredate();
        int salary = ProductDataUtil.productSal(job);
        return ename + "\t" + age + "\t" + job + "\t" + hiredate + "\t" + salary + "\t" + deptno;
    }

    // Fixed department records: deptno, dname, loc (tab-separated)
    public static List<String> productDept() {
        List<String> source = new ArrayList<String>();
        source.add("RSA001_20190121" + "\t" + "产品部" + "\t" + "RSA001");
        source.add("RSA002_20180623" + "\t" + "设计部" + "\t" + "RSA002");
        source.add("RSA003_20150422" + "\t" + "技术部" + "\t" + "RSA003");
        source.add("RSA004_20160903" + "\t" + "运营部" + "\t" + "RSA004");
        source.add("RSA005_20120608" + "\t" + "财务部" + "\t" + "RSA005");
        return source;
    }

    // Write nums employee records to path; each record is already tab-separated,
    // so it is printed as a single field per line after the header
    public static void saveEmp(int nums, String path) throws IOException {
        Appendable out = new PrintWriter(path);
        CSVPrinter csvPrinter = CSVFormat.DEFAULT
                .withHeader("ename", "age", "job", "hiredate", "salary", "deptno")
                .print(out);
        for (int i = 0; i < nums; i++) {
            String record = productEmp();
            csvPrinter.printRecord(record);
        }
        csvPrinter.flush();
        csvPrinter.close();
    }

    // Write the department records to path
    public static void saveDept(String path) throws IOException {
        Appendable out = new PrintWriter(path);
        CSVPrinter csvPrinter = CSVFormat.DEFAULT
                .withHeader("deptno", "dname", "loc")
                .print(out);
        List<String> records = productDept();
        for (String record : records) {
            csvPrinter.printRecord(record);
        }
        csvPrinter.flush();
        csvPrinter.close();
    }

    // args[0] = dept output path, args[1] = number of employee records, args[2] = emp output path
    public static void main(String[] args) throws IOException {
        saveDept(args[0]);
        saveEmp(Integer.parseInt(args[1]), args[2]);
    }
}
Generating the data in CSV format is very fast.
Package the project as a jar, upload it to the test environment, and run the jar to generate the mock data.
After uploading the jar to the server, run the following command:
java -cp Table_emp_dept-1.0-SNAPSHOT-jar-with-dependencies.jar product.data.ProductData './dept.csv' '10000000' './emp.csv'
Notes:
1. Table_emp_dept-1.0-SNAPSHOT-jar-with-dependencies.jar is the jar built by the Maven assembly plugin.
2. product.data.ProductData is the fully qualified name (package plus class) of the main class.
3. './dept.csv' '10000000' './emp.csv' are the output path for the department data, the number of employee records to generate (10,000,000), and the output path for the employee data.
4. Output location: the paths are relative, so the files are written to the directory the command is run from (here, the root directory /).
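Before moving the files into HDFS, a quick sanity check with standard shell commands is enough (a minimal sketch; the paths assume the command above was run from the current directory):
# Each file has one header line followed by the data rows
wc -l ./dept.csv ./emp.csv
# Data fields are tab-separated; the header line itself is comma-separated
head -5 ./emp.csv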
Create the corresponding tables in Hive:
CREATE EXTERNAL TABLE IF NOT EXISTS default.dept (
  deptno STRING,
  dname STRING,
  loc STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
CREATE EXTERNAL TABLE IF NOT EXISTS default.emp (
  ename STRING,
  age INT,
  job STRING,
  hiredate STRING,
  salary INT,
  deptno STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
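With the tables in place, the generated CSVs can be pushed to HDFS and loaded into them. A minimal sketch, assuming a /tmp/mockdata staging directory and hive -e for the load statements (both are assumptions, not from the original); note that each file starts with a comma-separated header row that should be stripped first, since the data rows themselves are tab-separated:
# Strip the header line from each file (assumed approach)
tail -n +2 dept.csv > dept.data
tail -n +2 emp.csv > emp.data
# Stage the files in HDFS
hdfs dfs -mkdir -p /tmp/mockdata
hdfs dfs -put dept.data emp.data /tmp/mockdata/
# Load the staged files into the Hive tables created above
hive -e "LOAD DATA INPATH '/tmp/mockdata/dept.data' INTO TABLE default.dept;"
hive -e "LOAD DATA INPATH '/tmp/mockdata/emp.data' INTO TABLE default.emp;"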
Note: if CDH raises a permission problem and writes to /user are refused, run the following statement to change the permissions:
sudo -u hdfs hadoop fs -chmod 777 /user
Reference link: https://blog.csdn.net/weixin_40040107/article/details/102787062