A simple example of writing into Hive with Flink SQL on a CDH cluster (Hive 2.1.1-CDH6.2.0).
Hive Streaming
import java.time.Duration

import com.sm.common.conf.PropManager
import com.sm.constants.Constants
import com.sm.utils.FlinkUtils
import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic}
import org.apache.flink.streaming.api.environment.{CheckpointConfig, ExecutionCheckpointingOptions}
import org.apache.flink.streaming.api.scala._
import org.apache.flink.table.api.SqlDialect
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment
import org.apache.flink.table.catalog.hive.HiveCatalog
import org.apache.log4j.Level
import org.slf4j.LoggerFactory

/**
 * flink sql kafka to hive
 *
 * create by LiuJinHe 2020/10/22
 */
object FlinkSqlKafka2Hive {
  private var logger: org.slf4j.Logger = _

  def main(args: Array[String]): Unit = {
    logger = LoggerFactory.getLogger(this.getClass.getSimpleName)
    org.apache.log4j.Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
    org.apache.log4j.Logger.getLogger("org.apache").setLevel(Level.INFO)

    // Initialize the stream environment.
    // For local testing with the web UI, the flink-runtime-web dependency is required.
    val env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI()
    // val env = StreamExecutionEnvironment.getExecutionEnvironment

    // Read program arguments.
    // val params = ParameterTool.fromArgs(args)
    // env.getConfig.setGlobalJobParameters(params)

    // Restart strategy: fixed delay, restart every 3 seconds, at most 10 attempts.
    // env.setRestartStrategy(RestartStrategies.fixedDelayRestart(10, 3))

    // Parallelism 1 for local testing.
    env.setParallelism(1)

    // Processing time, i.e. driven by the system clock.
    env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime)

    // Create the StreamTableEnvironment.
    val tableEnv: StreamTableEnvironment = StreamTableEnvironment.create(env, FlinkUtils.getSettings)

    // Checkpoint settings.
    val tableConfig = tableEnv.getConfig.getConfiguration
    tableConfig.set(ExecutionCheckpointingOptions.CHECKPOINTING_MODE, CheckpointingMode.EXACTLY_ONCE)
    // Checkpoint interval: trigger a checkpoint every 30 seconds.
    tableConfig.set(ExecutionCheckpointingOptions.CHECKPOINTING_INTERVAL, Duration.ofSeconds(30))
    // Checkpoint timeout: a checkpoint not finished within one minute is discarded.
    // tableConfig.set(ExecutionCheckpointingOptions.CHECKPOINTING_TIMEOUT, Duration.ofSeconds(60))
    // Minimum pause: at least 30 seconds between two checkpoints.
    // tableConfig.set(ExecutionCheckpointingOptions.MIN_PAUSE_BETWEEN_CHECKPOINTS, Duration.ofSeconds(30))
    // Only one checkpoint in flight at a time.
    // tableConfig.set(ExecutionCheckpointingOptions.MAX_CONCURRENT_CHECKPOINTS, Integer.valueOf(1))
    // Whether to retain checkpoints when the job is cancelled manually.
    // tableConfig.set(ExecutionCheckpointingOptions.EXTERNALIZED_CHECKPOINT,
    //   CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)

    // Minimum and maximum idle state retention time.
    // tableEnv.getConfig.setIdleStateRetentionTime(Time.hours(12), Time.hours(24))

    // Load configuration.
    val catalog_name = PropManager.getProp(Constants.CATALOG_NAME)
    val database = PropManager.getProp(Constants.DEFAULT_DATABASE)
    val schemaDataBase = PropManager.getProp(Constants.SCHEMA_DATABASE)

    // Build the Hive catalog.
    val hiveCatalog = new HiveCatalog(
      catalog_name,
      database,
      PropManager.getProp(Constants.HIVE_CONF_DIR)
    )

    tableEnv.registerCatalog(catalog_name, hiveCatalog)
    tableEnv.useCatalog(catalog_name)

    // Create the Kafka source table using the DEFAULT dialect.
    tableEnv.getConfig.setSqlDialect(SqlDialect.DEFAULT)
    tableEnv.useDatabase(schemaDataBase)

    // If the Kafka source table already exists in the catalog, drop it first.
    tableEnv.executeSql("drop table if exists kafka_source_Table")

    // Kafka source table
    val kafkaSourceDDL =
      """
        |create table kafka_source_Table (
        |  user_name string,
        |  user_id bigint,
        |  `time` bigint,
        |  ts as to_timestamp(from_unixtime(`time` / 1000, 'yyyy-MM-dd HH:mm:ss')),
        |  rt as proctime()
        |) with (
        |  'connector' = 'kafka',
        |  'topic' = 'flink_test',
        |  'properties.bootstrap.servers' = '192.168.100.39:9092',
        |  'properties.group.id' = 'flink-test-group',
        |  'format' = 'json',
        |  'scan.startup.mode' = 'group-offsets'
        |)
      """.stripMargin
    tableEnv.executeSql(kafkaSourceDDL)

    // Switch to the Hive dialect for the Hive sink table.
    tableEnv.getConfig.setSqlDialect(SqlDialect.HIVE)
    tableEnv.useDatabase(database)

    // Drop the table first; if the Hive table already exists, drop and create can be skipped.
    // tableEnv.executeSql("drop table if exists hive_sink_table")

    // Hive sink table.
    // Data is only actually committed to Hive after each checkpoint completes, producing one file per checkpoint.
    // Note: the partition column `date` is declared only in partitioned by, not in the regular column list.
    val hiveSinkDDL =
      """
        |create table hive_sink_table (
        |  user_name string,
        |  user_id bigint,
        |  `time` bigint
        |)
        |partitioned by (`date` string)
        |row format delimited fields terminated by '\t'
        |stored as orc
        |location 'hdfs://BigdataCluster/user/hive/warehouse/test_data.db/test/test_hive_table'
        |tblproperties (
        |  'orc.compress'='SNAPPY',
        |  'partition.time-extractor.timestamp-pattern' = '$date 00:00:00',
        |  'sink.partition-commit.trigger' = 'process-time',
        |  'sink.partition-commit.policy.kind' = 'metastore,success-file'
        |)
      """.stripMargin
    tableEnv.executeSql(hiveSinkDDL)
    // 'sink.partition-commit.delay' = '10 s',

    // Write the data.
    val insertSql =
      """
        |insert into hive_sink_table
        |select
        |  user_name,
        |  user_id,
        |  `time`,
        |  date_format(ts, 'yyyy-MM-dd') as `date`
        |from schema_data.kafka_source_Table
      """.stripMargin
    tableEnv.executeSql(insertSql)

    // To inspect the input data during testing: 1. print via a connector=print table; 2. convert to a DataStream and print.
    // val querySql =
    //   """
    //     |select
    //     |  user_name,
    //     |  user_id,
    //     |  date_format(ts, 'yyyy-MM-dd') as `date`,
    //     |  date_format(ts, 'HH'),
    //     |  date_format(ts, 'mm')
    //     |from kafka_source_Table
    //   """.stripMargin
    // val table = tableEnv.sqlQuery(querySql)
    // table.printSchema()
    //
    // val resultDStream = tableEnv.toAppendStream[Row](table)
    // resultDStream.print()
    // env.execute("flink sql kafka 2 hive")
  }
}
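The job depends on two in-house helpers that are not shown above: FlinkUtils.getSettings and PropManager/Constants. Below is a minimal sketch of what they might look like, assuming a Blink-planner streaming setup and a plain properties file on the classpath; the property keys, file name and default values are illustrative assumptions, not the original implementation.

import java.util.Properties
import org.apache.flink.table.api.EnvironmentSettings

// Hypothetical stand-in for com.sm.utils.FlinkUtils
object FlinkUtils {
  // Blink planner in streaming mode, required for the Hive streaming sink in Flink 1.11
  def getSettings: EnvironmentSettings =
    EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build()
}

// Hypothetical stand-in for com.sm.constants.Constants: keys looked up in the properties file
object Constants {
  val CATALOG_NAME = "catalog.name"         // e.g. "hive_catalog"
  val DEFAULT_DATABASE = "default.database" // e.g. "test_data"
  val SCHEMA_DATABASE = "schema.database"   // e.g. "schema_data"
  val HIVE_CONF_DIR = "hive.conf.dir"       // directory containing hive-site.xml
}

// Hypothetical stand-in for com.sm.common.conf.PropManager: reads a properties file from the classpath
object PropManager {
  private val props = new Properties()
  props.load(getClass.getClassLoader.getResourceAsStream("application.properties"))

  def getProp(key: String): String = props.getProperty(key)
}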
pom
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <artifactId>realtime-flink-platform-test</artifactId>
    <groupId>com.sm.com.sm.job</groupId>
    <version>1.0-SNAPSHOT</version>
    <modelVersion>4.0.0</modelVersion>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <scala.version>2.11.12</scala.version>
        <scala.binary.version>2.11</scala.binary.version>
        <flink.version>1.11.2</flink.version>
        <hive.version>2.1.1-cdh6.2.0</hive.version>
        <hadoop.version>3.0.0-cdh6.2.0</hadoop.version>
    </properties>

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>com.sm.common</groupId>
            <artifactId>common</artifactId>
            <version>1.0-SNAPSHOT</version>
        </dependency>

        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
            <!-- <scope>provided</scope>-->
        </dependency>

        <!-- Flink Dependency -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-runtime-web_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!-- <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-core</artifactId>
            <version>${flink.version}</version>
            <!-- <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!-- <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-common</artifactId>
            <version>${flink.version}</version>
            <!-- <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-scala-bridge_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!-- <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!-- <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!-- <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!-- <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-json</artifactId>
            <version>${flink.version}</version>
            <!-- <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-orc_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-sql-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!-- <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-sql-connector-hive-2.2.0_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!-- <scope>provided</scope>-->
        </dependency>

        <!-- Hive Dependency -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-serde</artifactId>
            <version>${hive.version}</version>
            <!-- <scope>provided</scope>-->
            <exclusions>
                <exclusion>
                    <groupId>org.apache.logging.log4j</groupId>
                    <artifactId>log4j-slf4j-impl</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.hive</groupId>
                    <artifactId>hive-common</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.hive</groupId>
                    <artifactId>hive-service-rpc</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.parquet</groupId>
                    <artifactId>parquet-hadoop-bundle</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
            <!-- <scope>provided</scope>-->
            <exclusions>
                <exclusion>
                    <groupId>org.apache.logging.log4j</groupId>
                    <artifactId>log4j-slf4j-impl</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.hive</groupId>
                    <artifactId>hive-llap-tez</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <!-- Hadoop Dependency -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
            <!-- <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Compiles the Scala sources to class files -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>4.2.0</version>
                <executions>
                    <execution>
                        <goals>
                            <!-- bind to Maven's compile phase -->
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.0.0</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
When submitting to the cluster, place the following jar under $FLINK_HOME/lib:
flink-sql-connector-hive-2.2.0_2.11-1.11.2.jar
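After the job has completed at least one checkpoint and the partition has been committed to the metastore, the result can be checked from the Hive side, for example with `show partitions hive_sink_table` or a simple `select ... limit 10` in beeline (assuming the default database configured above, e.g. test_data).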
