In this example, we will use SciPy to compute the value of the normal probability density function (PDF) for each of 1,000,000 random numbers between 0 and 1:
import pyspark.sql.functions as f
import pandas as pd
from scipy import stats
# Build a 1,000,000-row DataFrame and attach a random 'val' column to it.
# cache() hands back the same DataFrame, so it can close the chain; the
# frame is marked for caching before the preview below is triggered.
big_df = spark.range(0, 1000000).withColumn('val', f.rand()).cache()

# Preview the first three rows.
big_df.show(3)
@f.pandas_udf('double', f.PandasUDFType.SCALAR)
def pandas_pdf(v):
    """Return the normal PDF evaluated at every element of ``v``.

    Registered as a scalar pandas UDF with a ``double`` return type.
    ``stats.norm.pdf`` is called with its default parameters, so the
    distribution is the one SciPy uses when no ``loc``/``scale`` is given.

    Args:
        v: Batch of input values to evaluate (presumably a ``pandas.Series``
            handed over by Spark -- confirm against the pandas_udf docs).

    Returns:
        A ``pandas.Series`` holding one density value per input element.
    """
    # stats.norm.pdf works on the whole batch at once; wrap the result in a
    # Series before returning it from the UDF.
    return pd.Series(stats.norm.pdf(v))
# Add a 'probability' column by running pandas_pdf over 'val', then print
# the first five rows of the result.
big_df.withColumn('probability', pandas_pdf(big_df['val'])).show(5)