pyspark - jjin-choi/study_note GitHub Wiki
pyspark 에서 duplicate 없애는 방법
pyspark 에서 여러 DB 접속하는 방법
from pyspark.sql import SparkSession
# MongoDB: create a session configured for the Mongo Spark connector and load
# one collection into a Spark DataFrame.
# The same URI is used for the input config, the output config and the reader
# option, so it is defined once here.
MONGO_URI = 'mongodb://10.230.74.24:27020/dvlr.sim_log'

# The chained calls are wrapped in parentheses so the expression may span
# several lines; a bare line starting with `.` is a syntax error in Python.
spark = (
    SparkSession.builder
    .appName('mongoDB')
    .config('spark.mongodb.input.uri', MONGO_URI)
    .config('spark.mongodb.output.uri', MONGO_URI)
    .getOrCreate()
)
df = (
    spark.read.format('mongo')
    .option('uri', MONGO_URI)
    .load()
)
# PostgreSQL: create a session with the PostgreSQL JDBC driver jar and load
# one table into a Spark DataFrame through the generic 'jdbc' source.
# The chained calls are wrapped in parentheses so the expression may span
# several lines; a bare line starting with `.` is a syntax error in Python.
# NOTE(review): getOrCreate() hands back an already-active session when one
# exists, so if this runs after another session was created in the same
# process the 'spark.jars' setting below may not be applied -- confirm, or
# stop the earlier session first.
spark = (
    SparkSession.builder
    .appName('Pyspark connected with Postgre')
    .config('spark.jars', 'postgresql-42.2.23.jar')
    .getOrCreate()
)
# NOTE(review): the user name and password are hard-coded in this snippet.
# Load them from the environment or a secret store before reusing this code,
# and rotate the password that has been published here.
df = (
    spark.read.format('jdbc')
    .option('url', 'jdbc:postgresql://10.230.74.162:5432/ibuilder')
    .option('dbtable', 'NRE_LICENSE')
    .option('user', 'jongwoo6969')
    .option('password', 'fbwhddn77^^')
    .option('driver', 'org.postgresql.Driver')
    .load()
)
pyspark function
- collect() : 해당 dataframe 의 모든 row 를 반환
import pandas as pd

# df : spark DataFrame
# collect() brings every row of the Spark DataFrame back as a Python list.
# The column names are passed explicitly so the pandas frame carries the
# Spark column labels instead of relying on pandas to infer them from the
# row objects, and so an empty result still has the expected columns.
pddf = pd.DataFrame(df.collect(), columns=df.columns)
- cache() : 자주 불리게 되는 dataframe 을 cache 라는 함수를 사용하여 메모리에 남겨둠