Analyzing Small File Information


What it does: pulls Hive metadata from the metastore's MySQL database together with file information extracted from the HDFS fsimage, then computes per-table file counts, total file sizes, and average file sizes in order to identify tables made up of small files.

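For reference, `hdfs oiv -p Delimited` writes one tab-separated row per inode, preceded by a header row whose columns (Path, Replication, ModificationTime, AccessTime, PreferredBlockSize, BlocksCount, FileSize, NSQUOTA, DSQUOTA, Permission, UserName, GroupName) are what the HDFS_META_TEMP table in the script maps onto; the script deletes that header row before uploading. A quick way to eyeball the dump before loading it (a sketch reusing the script's paths):

# Preview the first rows/columns of the Delimited fsimage dump
head -n 5 /data12/wqg/fsimage/fsimage.csv | cut -f 1-7 | column -t -s$'\t'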
Script:

#!/bin/bash
# Echo each command and abort on any error
set -x
set -e

current_time=`date '+%Y-%m-%d %H:%M:%S'`
echo "start time: $current_time"

cd /data12/wqg/fsimage/
rm -rf /data12/wqg/fsimage/fsimage_*
if [ ! -f "/data12/wqg/fsimage/fsimage.csv" ]; then
    echo "fsimage.csv does not exist"
else
    echo "fsimage.csv exists and will be deleted"
    rm -rf /data12/wqg/fsimage/fsimage.csv
fi

# Obtain a Kerberos ticket for the hdfs principal
kinit -kt /data12/wqg/keytab/hdfs.keytab hdfs@MYCDH

# Fetch the latest fsimage file from the NameNode
hdfs dfsadmin -fetchImage /data12/wqg/fsimage/
files=`ls /data12/wqg/fsimage/fsimage_*`
for filename in $files
do
    fsimage_name=$filename
    echo "$current_time>> fsimage name: $filename"
done

# Allow the offline image viewer up to 30 GB of heap
export HADOOP_OPTS="-Xmx30G"
# Convert the binary fsimage into a human-readable, tab-delimited file
hdfs oiv -i /data12/wqg/fsimage/$fsimage_name -o /data12/wqg/fsimage/fsimage.csv -p Delimited

cd /data12/wqg/fsimage/
if [ ! -f "/data12/wqg/fsimage/fsimage.csv" ]; then
    echo "fsimage.csv does not exist; skipping upload"
else
    # Drop the header row, then upload the dump to HDFS
    sed -i -e "1d" /data12/wqg/fsimage/fsimage.csv
    hadoop fs -put -f /data12/wqg/fsimage/fsimage.csv /tmp/rock/fsimg/
fi

# Obtain a Kerberos ticket for the hive principal
kinit -kt /data12/wqg/keytab/hive.keytab hive@MYCDH
echo "$current_time: end fetchImage-----------"

echo "$current_time: create the staging table in the metastore MySQL"
mysql -h<ip> -P<port> -u<user> -p<password> <database> << EOF
use test;
drop table if exists test.hive_tab_dir_detail;
create table test.hive_tab_dir_detail as
-- non-partitioned tables: one row per table location
select 0 flag, c.NAME, c.DB_LOCATION_URI, a.TBL_NAME, a.CREATE_TIME, a.OWNER, a.TBL_TYPE, b.LOCATION
from hive.TBLS a, hive.SDS b, hive.DBS c
where a.SD_ID = b.SD_ID
  and a.DB_ID = c.DB_ID
  and a.TBL_ID not in (select d.TBL_ID from hive.PARTITIONS d where a.TBL_ID = d.TBL_ID)
union all
-- partitioned tables: one row per partition location
select 1 flag, c.NAME, c.DB_LOCATION_URI, a.TBL_NAME, a.CREATE_TIME, a.OWNER, a.TBL_TYPE,
       concat(b.LOCATION, '/', d.PART_NAME) LOCATION
from hive.TBLS a, hive.SDS b, hive.DBS c, hive.PARTITIONS d
where a.SD_ID = b.SD_ID
  and a.DB_ID = c.DB_ID
  and a.TBL_ID = d.TBL_ID
  and a.TBL_ID in (select d.TBL_ID from hive.PARTITIONS d where a.TBL_ID = d.TBL_ID);
EOF

# Extract the staging table into HDFS
echo "$current_time: start sqoop"
sqoop import \
  --connect "jdbc:mysql://ip:port/test" \
  --username root \
  --password admin123 \
  --table hive_tab_dir_detail \
  --fields-terminated-by ',' \
  --delete-target-dir \
  --target-dir /tmp/wqg/hive_meta_tab_detail \
  -m 1

echo "$current_time: start hive -e"
# If beeline/Kerberos is not set up, use hive -e instead of beeline
beeline -u "jdbc:hive2://ip:port/default;principal=hive/<principal>" -e "
use rhao;

-- external table over the Hive metadata exported from MySQL
CREATE EXTERNAL TABLE if not exists rhao.wqg_hive_meta_tab_detail(
  flag string,
  name string,
  db_location_uri string,
  tbl_name string,
  create_time bigint,
  owner string,
  tbl_type string,
  location string
)
row format delimited fields terminated by ','
LOCATION '/tmp/wqg/hive_meta_tab_detail';

-- external table over the fsimage dump
CREATE EXTERNAL TABLE if not exists rhao.HDFS_META_TEMP(
  PATH STRING,
  REPL INT,
  MODIFICATION_TIME STRING,
  ACCESSTIME STRING,
  PREFERREDBLOCKSIZE INT,
  BLOCKCOUNT DOUBLE,
  FILESIZE DOUBLE,
  NSQUOTA INT,
  DSQUOTA INT,
  PERMISSION STRING,
  USERNAME STRING,
  GROUPNAME STRING
)
row format delimited fields terminated by '\t'
LOCATION '/tmp/rock/fsimg/';

-- aggregate file counts and sizes per directory
drop table if exists rhao.hdfs_dir_stat_2;
create table rhao.hdfs_dir_stat_2 as
select regexp_extract(regexp_extract(path,'(\/([^\/]+\/)+)',1),'((\/[^\/]+)+)',1) dir,
       modification_time,
       accesstime,
       count(1) file_nums,
       sum(blockcount) blockcounts,
       sum(filesize) filesizes,
       sum(filesize)/count(1) avg_filesize
from rhao.hdfs_meta_temp
where repl <> 0
group by regexp_extract(regexp_extract(path,'(\/([^\/]+\/)+)',1),'((\/[^\/]+)+)',1), modification_time, accesstime;

-- join directory stats to the table/partition locations
drop table if exists rhao.hive_last_dir_stat_all_2;
create table rhao.hive_last_dir_stat_all_2 as
select a.name db_name,
       a.tbl_name table_name,
       a.create_time,
       a.tbl_type tbl_type,
       regexp_replace(a.location,'hdfs://nameservice1','') location,
       b.modification_time,
       b.accesstime,
       b.file_nums,
       b.blockcounts,
       b.filesizes,
       b.avg_filesize
from rhao.wqg_hive_meta_tab_detail a, rhao.hdfs_dir_stat_2 b
where regexp_replace(a.location,'hdfs://nameservice1','') = b.dir;

-- roll partition-level rows up to the table level
drop table if exists rhao.hive_tab_dir_stat_sort_all_2;
create table rhao.hive_tab_dir_stat_sort_all_2 as
select db_name,
       table_name,
       create_time,
       modification_time,
       accesstime,
       tbl_type,
       sum(file_nums) file_nums,
       sum(blockcounts) blockcounts,
       sum(filesizes) filesizes,
       sum(filesizes)/sum(file_nums) avg_filesizes
from rhao.hive_last_dir_stat_all_2
group by db_name, table_name, tbl_type, create_time, modification_time, accesstime
sort by file_nums desc;

-- final result: one row per table, with readable timestamps
drop table if exists rhao.hive_tab_dir_stat_sort_all_2_sort;
create table rhao.hive_tab_dir_stat_sort_all_2_sort as
select db_name,
       table_name,
       from_unixtime(create_time,'yyyy-MM-dd HH:mm:ss') create_time,
       tbl_type,
       max(from_unixtime(unix_timestamp(concat(a.modification_time,':00')),'yyyy-MM-dd HH:mm:ss')) modification_time,
       max(from_unixtime(unix_timestamp(concat(a.accesstime,':00')),'yyyy-MM-dd HH:mm:ss')) accesstime,
       sum(file_nums) file_nums,
       sum(blockcounts) blockcounts,
       sum(filesizes) filesizes,
       sum(filesizes)/sum(file_nums) avg_filesizes
from rhao.hive_last_dir_stat_all_2 a
group by db_name, table_name, tbl_type, create_time
sort by file_nums desc;
"
echo "end time: `date '+%Y-%m-%d %H:%M:%S'`"
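In hdfs_dir_stat_2, the nested regexp_extract first captures everything up to and including the last '/' in a path, and the outer call then strips the trailing slash, reducing each file to its parent directory; that directory string is what gets matched against the table/partition LOCATION. A minimal check (the sample path is made up; the connection string mirrors the script's):

# Should print /user/hive/warehouse/db1.db/t1 for the sample path below
beeline -u "jdbc:hive2://ip:port/default;principal=hive/<principal>" -e "
select regexp_extract(regexp_extract('/user/hive/warehouse/db1.db/t1/part-00000','(\/([^\/]+\/)+)',1),'((\/[^\/]+)+)',1);
"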
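Once a run completes, the final table can be queried directly to spot the worst small-file offenders. A sketch (the 128 MB cutoff is only illustrative):

# List the 20 tables with the most files whose average file size is below one 128 MB block
beeline -u "jdbc:hive2://ip:port/default;principal=hive/<principal>" -e "
select db_name, table_name, file_nums, filesizes, avg_filesizes
from rhao.hive_tab_dir_stat_sort_all_2_sort
where avg_filesizes < 134217728
order by file_nums desc
limit 20;
"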