from pyspark import SparkContext
from pyspark.sql import SQLContext
sc = SparkContext("local", "Simple App")
sqlContext = SQLContext(sc)
url = \
"jdbc:mysql://localhost:3306/stock_data?user=root&password=test"
df = sqlContext \
.read \
.format("jdbc") \
.option("url", url) \
.option("dbtable", "stock_detail_collect") \
.load()
df.printSchema()
counts = df.groupBy("stock_id").count()
counts.show()
===========
The table only has 1,153,194 records, so why does running the code above report a memory leak:
16/02/05 23:30:28 WARN TaskMemoryManager: leak 8.3 MB memory from org.apache.spark.unsafe.map.BytesToBytesMap@431395b1
16/02/05 23:30:28 ERROR Executor: Managed memory leak detected; size = 8650752 bytes, TID = 1
Environment: spark-1.6.0-bin-hadoop2.6
Ubuntu 14.04.3 LTS
jdk1.8.0_66
I can't tell where the problem is. How do I fix it? Thanks a lot.
1 solution
#1
counts = df.groupBy("stock_id").count()
counts.show()
Change it to write the result to a file instead:
# The result is small (one row per stock_id), so collect() it to the driver
# and write it out; the output path below is just an example.
file_output = open("stock_counts.txt", "w")
df.registerTempTable("people")
count = sqlContext.sql(
    "select stock_id, count(*) as c from people group by stock_id order by stock_id")
for row in count.collect():
    file_output.write(str(row) + "\n")
file_output.flush()
file_output.close()
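For reference, a minimal sketch of writing the grouped counts directly from the executors instead of collecting them to the driver, using the DataFrameWriter available in Spark 1.6; the output directory "stock_counts_json" is only an example path:

# Aggregate on the cluster and write the result as JSON part files,
# avoiding a collect() to the driver; the path is an example.
counts = df.groupBy("stock_id").count()
counts.write.mode("overwrite").json("stock_counts_json")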