Data Engineering 101 SQL and PySpark 1727161935
Data Engineering 101 SQL and PySpark 1727161935
Data Engineering 101 SQL and PySpark 1727161935
Data Engineering 101:
SQL and
PySpark
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.select("*")
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.select("col1", "col2")
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.filter("condition")
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
ORDERING ROWS
SQL
SELECT * FROM table ORDER BY col1;
PYSPARK
df.orderBy("col1")
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.orderBy(df.col1.desc())
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
LIMITING ROWS
SQL
SELECT * FROM table LIMIT 10;
PYSPARK
df.limit(10)
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.select("col1").distinct()
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.groupBy("col1").count()
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.groupBy("col1").avg("col2")
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
INNER JOIN
SQL
SELECT *
FROM table1
INNER JOIN table2
ON table1.id = table2.id;
PYSPARK
df1.join(df2, df1.id == df2.id, "inner")
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
LEFT JOIN
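SQL
SELECT * FROM table1
LEFT JOIN table2
ON table1.id = table2.id;
PYSPARK
df1.join(df2, df1.id == df2.id, "left")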
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
RIGHT JOIN
SQL
SELECT * FROM table1
RIGHT JOIN table2
ON table1.id = table2.id;
PYSPARK
df1.join(df2, df1.id == df2.id, "right")
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
SUBQUERIES
SQL
SELECT * FROM (SELECT col1, col2 FROM
table) sub_table;
PYSPARK
df.select("col1", "col2").alias("sub_table")
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
CASE STATEMENTS
SQL
SELECT col1,
CASE WHEN condition
THEN result ELSE result2 END
FROM table;
PYSPARK
df.select("col1", when(condition,
result).otherwise(result2).alias("new_col"))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.withColumn("row_num",
row_number().over(Window.orderBy("col2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.withColumn("sum_col2", sum("col2") \
.over(Window.partitionBy("col3")))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
LAG FUNCTION
SQL
SELECT col1,
LAG(col2, 1) OVER (ORDER BY col3)
AS lag_col2 FROM table;
PYSPARK
df.withColumn("lag_col2", lag("col2", 1) \
.over(Window.orderBy("col3")))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
LEAD FUNCTION
SQL
SELECT col1,
LEAD(col2, 1) OVER (ORDER BY col3) AS lead_col2
FROM table;
PYSPARK
df.withColumn("lead_col2", lead("col2", 1) \
.over(Window.orderBy("col3")))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.filter(df.col1.isNull())
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.filter(df.col1.isNotNull())
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df1.union(df2)
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df1.intersect(df2)
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df1.subtract(df2)
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.createOrReplaceTempView("temp_table")
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PIVOTING DATA
SQL
SELECT * FROM
(SELECT col1, col2 FROM table)
PIVOT (SUM(col2) FOR col1 IN ('value1', 'value2'));
PYSPARK
df.groupBy()\
.pivot("col1", ['value1', 'value2']).sum("col2")
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
UPDATING ROWS
SQL
UPDATE table SET col1 = value WHERE
condition;
PYSPARK
df = df.withColumn("col1", when(condition,
value).otherwise(df.col1))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
DELETING ROWS
SQL
DELETE FROM table WHERE condition;
PYSPARK
df = df.filter(~condition)
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
HANDLING DUPLICATES
SQL
SELECT col1, COUNT(*) FROM table
GROUP BY col1 HAVING COUNT(*) > 1;
PYSPARK
df.groupBy("col1").count().filter("count > 1")
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
CALCULATING PERCENTAGE
SQL
SELECT col1, (col2 / col3) * 100 AS percentage
FROM table;
PYSPARK
df.withColumn("percentage", (df.col2 /
df.col3) * 100)
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.select(concat("col1", "col2") \
.alias("new_col"))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.select(current_date().alias("today"))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.select(year("date_col").alias("year"))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
CONDITIONAL AGGREGATION
SQL
SELECT SUM(CASE WHEN condition THEN col1 ELSE 0 END)
FROM table;
PYSPARK
df.select(sum(when(condition, df.col1) \
.otherwise(0)))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
RENAMING COLUMNS
SQL
PYSPARK
df.withColumnRenamed("col1", "new_col1")
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
DROPPING COLUMNS
SQL
PYSPARK
df.drop("col1")
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.withColumn("new_col", expression)
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
REPLACING VALUES
SQL
PYSPARK
df.withColumn("col1", when(condition,
new_value).otherwise(df.col1))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.select(my_udf("col1"))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
EXPLODING ARRAYS
SQL
SELECT col1, EXPLODE(array_col) AS exploded_col
FROM table;
PYSPARK
df.select("col1", explode("array_col") \
.alias("exploded_col"))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.select("col1", "nested_col.*")
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.select(df.col1.cast("data_type"))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.select(get_json_object("json_col", "$.key"))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
GROUPING SETS
SQL
SELECT col1, col2, SUM(col3)
FROM table
GROUP BY GROUPING SETS ((col1), (col2));
PYSPARK
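# Note: groupBy("col1", "col2") groups by the (col1, col2) pair,
# which is NOT the same as GROUPING SETS ((col1), (col2)).
# Spark 4.0+ (on earlier versions, run the SQL above via spark.sql
# against a temp view):
df.groupingSets([["col1"], ["col2"]], "col1", "col2") \
.agg(sum("col3"))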
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
ROLLUP
SQL
SELECT col1, col2, SUM(col3)
FROM table
GROUP BY ROLLUP(col1, col2);
PYSPARK
df.rollup("col1", "col2") \
.agg(sum("col3"))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
CUBE
SQL
SELECT col1, col2, SUM(col3)
FROM table GROUP BY CUBE(col1, col2);
PYSPARK
df.cube("col1", "col2") \
.agg(sum("col3"))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
RANK FUNCTION
SQL
SELECT col1,
RANK() OVER (ORDER BY col2) AS rank
FROM table;
PYSPARK
df.withColumn("rank", rank() \
.over(Window.orderBy("col2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.withColumn("dense_rank",
dense_rank().over(Window.orderBy("col2")))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.withColumn("running_total", sum("col2")\
.over(Window.orderBy("col1") \
.rowsBetween(Window.unboundedPreceding,
Window.currentRow)))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.select(datediff("date1", "date2"))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.select(upper("col1"), lower("col2"))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.filter(df.col1.isin(value1, value2))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.filter(df.col1.between(value1, value2))
Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark
PYSPARK
df.orderBy("col1", df.col2.desc())
Shwetank Singh
GritSetGrow - GSGLearn.com