In this example, we will extract the size and the type of the HDD into separate columns, and then calculate the minimum volume needed to pack each computer into a box:
import pyspark.sql as sql
import pyspark.sql.functions as f
def _with_derived_columns(row):
    """Return a copy of ``row`` extended with HDD_size, HDD_type and Volume.

    The HDD field is split on single spaces; the first token becomes
    ``HDD_size`` and the second becomes ``HDD_type`` (both stay strings,
    exactly as ``str.split`` returns them).  ``Volume`` is the product of
    the row's ``H``, ``D`` and ``W`` fields.

    NOTE(review): assumes every HDD value contains at least two
    space-separated tokens (e.g. "<size> <type>"); a value with fewer
    tokens raises IndexError when the job runs -- confirm against the data.
    """
    # Split once and reuse the parts instead of re-splitting per column.
    hdd_parts = row.HDD.split(' ')

    # Original fields go first so the derived ones are appended after them.
    return sql.Row(
        **row.asDict(),
        HDD_size=hdd_parts[0],
        HDD_type=hdd_parts[1],
        Volume=row.H * row.D * row.W
    )


# Build all three derived columns in a single map, so each Row is
# reconstructed once rather than once per added column, then convert the
# RDD back to a DataFrame and pick the final column set.
sample_data_transformed = (
    sample_data_df
    .rdd
    .map(_with_derived_columns)
    .toDF()
    .select(
        # Keep every original column, in its original order ...
        sample_data_df.columns
        + [
            # ... followed by the derived ones.
            'HDD_size',
            'HDD_type',
            # f.round with no explicit scale is applied to the raw product;
            # the result is exposed under the name Volume_cuIn.
            f.round(f.col('Volume')).alias('Volume_cuIn'),
        ]
    )
)