I am trying to load my Postgres database into Spark using PySpark:
from pyspark import SparkContext
from pyspark import SparkConf
from random import random
#spark conf
conf = SparkConf()
conf.setMaster("spark://spark-master:7077")
conf.setAppName('pyspark')
sc = SparkContext(conf=conf)
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
properties = {
"user": "postgres",
"password": "password123",
"driver": "org.postgresql.Driver"
}
url = "jdbc.postgresql://<POSTGRES_IP>/DB_NAME"
df = sqlContext.read.jdbc(url=url, table='myTable', properties=properties)
I get the following error, and I have no idea what it means:
/opt/spark/python/pyspark/sql/readwriter.pyc in jdbc(self, url, table, column, lowerBound, upperBound, numPartitions, predicates, properties)
420 jpredicates = utils.toJArray(gateway, gateway.jvm.java.lang.String, predicates)
421 return self._df(self._jreader.jdbc(url, table, jpredicates, jprop))
--> 422 return self._df(self._jreader.jdbc(url, table, jprop))
423
424
/usr/local/lib/python2.7/dist-packages/py4j/java_gateway.pyc in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
/opt/spark/python/pyspark/sql/utils.pyc in deco(*a, **kw)
77 raise QueryExecutionException(s.split(': ', 1)[1], stackTrace)
78 if s.startswith('java.lang.IllegalArgumentException: '):
---> 79 raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
80 raise
81 return deco
IllegalArgumentException: u'Wrong FS: file://spark-warehouse, expected: file:///'
1 solution
#1
Specifying an existing directory for the spark.sql.warehouse.dir setting should solve your issue. For example, at job launch time:
./bin/spark-submit --conf spark.sql.warehouse.dir=/tmp/ \
... # other options
your_file.py \
[application-arguments]
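The error suggests the default warehouse location resolves to a malformed file:// URI (file://spark-warehouse instead of the expected file:///...), so pointing spark.sql.warehouse.dir at an existing, fully qualified path avoids it. The same setting can also be applied in the script itself before the SparkContext is created; a minimal sketch, assuming Spark 2.x and that /tmp/spark-warehouse is a usable path on the driver (adjust for your setup):

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

conf = SparkConf()
conf.setMaster("spark://spark-master:7077")
conf.setAppName('pyspark')
# Point the SQL warehouse at an existing directory, as a fully qualified file URI
# (example path only; replace with a location valid in your environment).
conf.set("spark.sql.warehouse.dir", "file:///tmp/spark-warehouse")

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)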