java spark-streaming接收TCP/Kafka数据
时间:2022-05-02
本文章向大家介绍java spark-streaming接收TCP/Kafka数据,主要内容包括其使用实例、应用技巧、基本知识点总结和需要注意事项,具有一定的参考价值,需要的朋友可以参考一下。
本文将展示
1、如何使用spark-streaming接入TCP数据并进行过滤;
2、如何使用spark-streaming接入Kafka数据并进行wordcount;
内容如下:
准备工作:使用maven,先在pom.xml中加入以下依赖
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka_2.10</artifactId>
<version>1.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.10</artifactId>
<version>1.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.10</artifactId>
<version>1.6.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.10</artifactId>
<version>1.6.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.10</artifactId>
<version>1.6.0</version>
<scope>provided</scope>
</dependency>
1、接收TCP数据并过滤,打印含有error的行
package com.xiaoju.dqa.realtime_streaming;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.Durations;
//nc -lk 9999
/**
 * Reads lines from a TCP socket (start the server with: nc -lk 9999),
 * filters for lines containing "error", and prints each matching batch.
 *
 * NOTE: the receiver occupies one core permanently, so the master must
 * provide at least 2 local threads; "local" (a single thread) would leave
 * no core free for batch processing and the job would appear to hang.
 */
public class SparkStreamingTCP {
    public static void main(String[] args) {
        // "local[2]": one thread for the socket receiver, one for processing.
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("streaming word count");
        // 1-second micro-batch interval.
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(1));
        JavaDStream<String> lines = jssc.socketTextStream("10.93.21.21", 9999);
        // Keep only lines that mention "error".
        JavaDStream<String> errorLines = lines.filter(new Function<String, Boolean>() {
            @Override
            public Boolean call(String s) throws Exception {
                return s.contains("error");
            }
        });
        errorLines.print();
        jssc.start();
        try {
            jssc.awaitTermination();
        } finally {
            // Release receiver threads and other resources on shutdown.
            jssc.close();
        }
    }
}
执行方法
$ spark-submit realtime-streaming-1.0-SNAPSHOT-jar-with-dependencies.jar
# 另起一个窗口
$ nc -lk 9999
# 输入数据
2、接收Kafka数据并进行计数(WordCount)
package com.xiaoju.dqa.realtime_streaming;
import java.util.*;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.api.java.*;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.Durations;
import scala.Tuple2;
// bin/kafka-console-producer.sh --broker-list localhost:9092 --topic test
/**
 * Consumes messages from Kafka (via the ZooKeeper-based receiver API),
 * splits each message into whitespace-separated words, and prints a
 * per-batch word count every 10 seconds.
 *
 * Fixes over the original version:
 *  - flatMap no longer returns a null Iterable for null/empty messages
 *    (a null Iterable causes an NPE inside Spark at runtime); it returns
 *    an empty list instead.
 *  - Exceptions are no longer used as control flow and then swallowed;
 *    blank input is handled with plain guards.
 *  - Empty tokens produced by split(" ") on consecutive spaces are
 *    filtered out instead of being emitted as ("", 0) tuples.
 */
public class SparkStreamingKafka {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setMaster("yarn-client").setAppName("streaming word count");
        //String topic = "offline_log_metrics";
        String topic = "test";
        // Number of parallel receivers to create for the topic(s).
        int part = 1;
        JavaSparkContext sc = new JavaSparkContext(conf);
        sc.setLogLevel("WARN");
        // 10-second micro-batch interval.
        JavaStreamingContext jssc = new JavaStreamingContext(sc, Durations.seconds(10));
        // topic name -> number of receiver threads for that topic.
        Map<String, Integer> topicMap = new HashMap<String, Integer>();
        String[] topics = topic.split(";");
        for (int i = 0; i < topics.length; i++) {
            topicMap.put(topics[i], 1);
        }
        // One receiver stream per partition slot; unioned below.
        List<JavaPairReceiverInputDStream<String, String>> list =
                new ArrayList<JavaPairReceiverInputDStream<String, String>>();
        for (int i = 0; i < part; i++) {
            list.add(KafkaUtils.createStream(jssc,
                    "10.93.21.21:2181",   // ZooKeeper quorum
                    "bigdata_qa",          // consumer group id
                    topicMap));
        }
        JavaPairDStream<String, String> wordCountLines = list.get(0);
        for (int i = 1; i < list.size(); i++) {
            wordCountLines = wordCountLines.union(list.get(i));
        }
        JavaPairDStream<String, Integer> counts = wordCountLines
                // Each Kafka record is a (key, message) pair; split the message into words.
                .flatMap(new FlatMapFunction<Tuple2<String, String>, String>() {
                    @Override
                    public Iterable<String> call(Tuple2<String, String> record) {
                        String message = record._2;
                        if (message == null || message.isEmpty()) {
                            // Must not return null: Spark iterates the result directly.
                            return Collections.emptyList();
                        }
                        // Drop empty tokens caused by consecutive spaces.
                        List<String> words = new ArrayList<String>();
                        for (String word : message.split(" ")) {
                            if (!word.isEmpty()) {
                                words.add(word);
                            }
                        }
                        return words;
                    }
                })
                // word -> (word, 1)
                .mapToPair(new PairFunction<String, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(String word) throws Exception {
                        return new Tuple2<String, Integer>(word, 1);
                    }
                })
                // Sum the 1s per word within each batch.
                .reduceByKey(new Function2<Integer, Integer, Integer>() {
                    @Override
                    public Integer call(Integer x, Integer y) throws Exception {
                        return x + y;
                    }
                });
        counts.print();
        jssc.start();
        try {
            jssc.awaitTermination();
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            // Release receivers and other resources on shutdown.
            jssc.close();
        }
    }
}
执行方法
$ spark-submit --queue=root.XXX realtime-streaming-1.0-SNAPSHOT-jar-with-dependencies.jar
# 另开一个窗口,启动kafka生产者
$ bin/kafka-console-producer.sh --broker-list localhost:9092 --topic test
# 输入数据
- 聚合索引(clustered index) / 非聚合索引(nonclustered index)
- 域名资讯:单词域名can.com以15.5万美金成功交易
- jQuery无缝图片横向(水平)/竖向(垂直)滚动
- Centos下MooseFS(MFS)分布式存储共享环境部署记录
- MFS+Keepalived双机高可用热备方案操作记录
- Docker容器学习梳理-容器时间跟宿主机时间同步
- AS1.0(2.0)中的XML示例
- kvm虚拟机日常操作命令梳理
- mongodb 总结
- 关于微信小程序内置组件swiper,circular使用分享
- zabbix问题记录
- MSDTC 故障排除
- 洪泰智造工场&腾讯云创业加速营全球招募
- MySQL存储引擎之Myisam和Innodb总结性梳理
- java教程
- Java快速入门
- Java 开发环境配置
- Java基本语法
- Java 对象和类
- Java 基本数据类型
- Java 变量类型
- Java 修饰符
- Java 运算符
- Java 循环结构
- Java 分支结构
- Java Number类
- Java Character类
- Java String类
- Java StringBuffer和StringBuilder类
- Java 数组
- Java 日期时间
- Java 正则表达式
- Java 方法
- Java 流(Stream)、文件(File)和IO
- Java 异常处理
- Java 继承
- Java 重写(Override)与重载(Overload)
- Java 多态
- Java 抽象类
- Java 封装
- Java 接口
- Java 包(package)
- Java 数据结构
- Java 集合框架
- Java 泛型
- Java 序列化
- Java 网络编程
- Java 发送邮件
- Java 多线程编程
- Java Applet基础
- Java 文档注释
- [数据结构与算法]赫夫曼树与赫夫曼编码
- [数据结构与算法] 查找算法
- Kafka——分布式的消息队列
- Scala——多范式, 可伸缩, 类似Java的编程语言
- Storm——分布式实时流式计算框架
- Vue实现复制excel表格内容粘贴至网页
- Sqoop——将关系数据库数据与hadoop数据进行转换的工具
- Sqoop导入数据时异常java.net.ConnectException: Connection refused
- Flume——高可用的、高可靠的、分布式日志收集系统
- Hadoop技术(三)数据仓库工具Hive
- Hadoop技术(一)分布式文件系统HDFS
- Docker技术( 容器虚拟化技术 )
- 分布式事务处理技术之LCN
- Swagger技术(接口文档实时动态生成工具)
- 大数据学习之Linux基础