# Demo: unify a main product table with a supplier feed, filling gaps
# via coalesce() — Description falls back main -> supplier, while
# AvailableQty falls back supplier -> main.
# NOTE(review): assumes `spark` (SparkSession) plus `coalesce` and `col`
# from pyspark.sql.functions are already in scope — confirm the preamble.
columns = ["ProductID", "Description", "AvailableQty"]

main_db_data = [
    (1, "Product A", 100),
    (2, "Product B", 50),
    (3, "Product C", 75),
]
supplier_feed_data = [
    (1, None, 120),          # supplier is missing the description
    (2, "Product B", None),  # supplier is missing the quantity
    (3, "Product C", 90),
]

main_db_df = spark.createDataFrame(main_db_data, columns)
supplier_feed_df = spark.createDataFrame(supplier_feed_data, columns)

# Inner join on the shared key; the aliases let us disambiguate the
# identically named columns coming from each side.
joined_df = main_db_df.alias("main").join(
    supplier_feed_df.alias("supplier"), on="ProductID"
)

merged_df = joined_df.withColumn(
    "FinalDescription",
    coalesce(col("main.Description"), col("supplier.Description")),
).withColumn(
    "FinalAvailableQty",
    coalesce(col("supplier.AvailableQty"), col("main.AvailableQty")),
)

merged_df.show()
+---------+-----------+------------+-----------+------------+----------------+-----------------+
|ProductID|Description|AvailableQty|Description|AvailableQty|FinalDescription|FinalAvailableQty|
+---------+-----------+------------+-----------+------------+----------------+-----------------+
| 1| Product A| 100| NULL| 120| Product A| 120|
| 2| Product B| 50| Product B| NULL| Product B| 50|
| 3| Product C| 75| Product C| 90| Product C| 90|
+---------+-----------+------------+-----------+------------+----------------+-----------------+
Merge or handle missing data with coalesce
By merging these data sources, we create a unified view of product availability — preferring the supplier's quantities and the main database's descriptions, and filling in any missing value from the other source.
1. Check whether the description exists in main
2. If not, fall back to the description in supplier
3. Set NULL if it exists in neither
Efficiently merge data from different sources while handling missing values using the coalesce function!
PySpark Tip⚡️