Spark serialization error: java.io.NotSerializableException

[Date: 2018-09-15]  Source: CSDN
The following error is reported because a class used inside a Spark operator (here, EntityMention) does not implement Serializable:
15/11/23 14:43:47 ERROR Executor: Exception in task 0.0 in stage 4.0 (TID 4)
java.io.NotSerializableException: EntityMention
Serialization stack:
	- object not serializable (class: EntityMention, value: EntityMention@5cdadff6)
	at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
	at org.apache.spark.serializer.SerializationStream.writeValue(Serializer.scala:147)
	at org.apache.spark.storage.DiskBlockObjectWriter.write(DiskBlockObjectWriter.scala:181)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.insertAll(BypassMergeSortShuffleWriter.java:121)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:73)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
	at org.apache.spark.scheduler.Task.run(Task.scala:88)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:744)
15/11/23 14:43:47 ERROR TaskSetManager: Task 0.0 in stage 4.0 (TID 4) had a not serializable result: EntityMention
Serialization stack:
	- object not serializable (class: EntityMention, value: EntityMention@5cdadff6); not retrying
15/11/23 14:43:47 INFO TaskSchedulerImpl: Removed TaskSet 4.0, whose tasks have all completed, from pool 
15/11/23 14:43:47 INFO TaskSchedulerImpl: Cancelling stage 4
15/11/23 14:43:47 INFO DAGScheduler: ShuffleMapStage 4 (groupBy at Relation_Detector.scala:78) failed in 0.081 s
15/11/23 14:43:47 INFO DAGScheduler: Job 4 failed: collect at Relation_Detector.scala:79, took 0.097801 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0 in stage 4.0 (TID 4) had a not serializable result: EntityMention
Serialization stack:
	- object not serializable (class: EntityMention, value: EntityMention@5cdadff6)
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1822)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1835)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1848)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1919)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:905)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:904)
	at Relation_Detector$.main(Relation_Detector.scala:79)
	at Relation_Detector.main(Relation_Detector.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at com.intellij.rt.execution.application.AppMain.main(AppMain.java:140)
15/11/23 14:43:47 INFO SparkContext: Invoking stop() from shutdown hook
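
The original post does not show the code that produced this trace. As a rough, hypothetical reconstruction (the object name Relation_Detector comes from the trace; the sample data, the grouping key, and the local master are assumptions), the failing pattern looks roughly like this: EntityMention objects go through a groupBy (a shuffle) and a collect, and both steps require Spark to serialize them.

import org.apache.spark.{SparkConf, SparkContext}

object Relation_Detector {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("Relation_Detector").setMaster("local[*]"))

    // Hypothetical sample data. As long as EntityMention does not extend Serializable,
    // both the shuffle below and the collect fail with java.io.NotSerializableException.
    val mentions = sc.parallelize(Seq(
      new EntityMention("m1", "Alice", "PER", "hb1", "t1", "post", "0", "3", "2015-11-23", "2015-11-23", "e1"),
      new EntityMention("m2", "Bob",   "PER", "hb2", "t1", "post", "1", "7", "2015-11-23", "2015-11-23", "e2")))

    val grouped = mentions.groupBy(_.entityID) // shuffle: values are written out with the Java serializer
    grouped.collect().foreach(println)         // collect: values are serialized back to the driver
    sc.stop()
  }
}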

Solution: make the class implement Serializable:

class EntityMention(val pentitymention_id:String,var pname:String,var pentitymention_type:String,
                    var pHbaseID:String,var ptopicID:String,var ppost_type:String,
                    var psenIndex:String,var psenOffset:String, var ppostTime:String,
                    var pinsertTime:String,var pentityID:String) extends Serializable {
  var entitymention_id = pentitymention_id
  var name = pname
  var entitymention_type = pentitymention_type
  var HbaseID = pHbaseID
  var topicID= ptopicID
  var post_type = ppost_type
  var senIndex = psenIndex
  var senOffset = psenOffset
  var postTime = ppostTime
  var insertTime = pinsertTime
  var entityID = pentityID
}
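
With the class now serializable, the groupBy/collect pipeline above runs without the exception. As an aside that is not in the original article: if the fields do not actually need to be reassigned, a Scala case class is an even shorter fix, because case classes extend Serializable automatically and avoid copying every constructor parameter into a field by hand. A minimal sketch:

// Hypothetical alternative: case classes are Serializable by default (fields are immutable vals).
case class EntityMention(
  entitymention_id: String, name: String, entitymention_type: String,
  HbaseID: String, topicID: String, post_type: String,
  senIndex: String, senOffset: String, postTime: String,
  insertTime: String, entityID: String)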