Data Engineering 101 SQL and PySpark 1727161935

Download as pdf or txt
Download as pdf or txt
You are on page 1of 58

Data

Engineering 101
SQL and
PySpark

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

SELECT ALL COLUMNS


SQL

SELECT * FROM table;

PYSPARK

df.select("*")

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

SELECT SPECIFIC COLUMNS


SQL

SELECT col1, col2 FROM table;

PYSPARK

df.select("col1", "col2")

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

FILTERING ROWS (WHERE CLAUSE)


SQL

SELECT * FROM table WHERE condition;

PYSPARK

df.filter("condition")

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

ORDERING ROWS
SQL

SELECT * FROM table ORDER BY col1;

PYSPARK

df.orderBy("col1")

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

ORDERING ROWS DESCENDING


SQL

SELECT * FROM table ORDER BY col1 DESC;

PYSPARK

df.orderBy(df.col1.desc())

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

LIMITING ROWS
SQL

SELECT * FROM table LIMIT 10;

PYSPARK

df.limit(10)

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

SELECTING DISTINCT VALUES


SQL

SELECT DISTINCT col1 FROM table;

PYSPARK

df.select("col1").distinct()

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

GROUP BY AND AGGREGATE


(COUNT)
SQL

SELECT col1, COUNT(*) FROM table GROUP


BY col1;

PYSPARK

df.groupBy("col1").count()

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

GROUP BY AND AGGREGATE (AVG)


SQL

SELECT col1, AVG(col2) FROM table GROUP


BY col1;

PYSPARK

df.groupBy("col1").avg("col2")

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

INNER JOIN
SQL
SELECT *
FROM table1
INNER JOIN table2
ON table1.id = table2.id;

PYSPARK

df1.join(df2, df1.id == df2.id, "inner")

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

LEFT JOIN
SQL

SELECT * FROM table1


LEFT JOIN table2 ON table1.id = table2.id;

PYSPARK

df1.join(df2, df1.id == df2.id, "left")

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

RIGHT JOIN
SQL
SELECT * FROM table1
RIGHT JOIN table2
ON table1.id = table2.id;

PYSPARK

df1.join(df2, df1.id == df2.id, "right")

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

FULL OUTER JOIN


SQL
SELECT * FROM table1
FULL JOIN table2
ON table1.id = table2.id;

PYSPARK

df1.join(df2, df1.id == df2.id, "outer")

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

SUBQUERIES
SQL
SELECT * FROM (SELECT col1, col2 FROM
table) sub_table;

PYSPARK

sub_df = df.select("col1", "col2")

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

CASE STATEMENTS
SQL
SELECT col1,
CASE WHEN condition
THEN result ELSE result2 END
FROM table;

PYSPARK
df.select("col1", when(condition,
result).otherwise(result2).alias("new_col"))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

WINDOW FUNCTIONS (ROW NUMBER)


SQL
SELECT col1,
ROW_NUMBER() OVER (ORDER BY col2) AS row_num
FROM table;

PYSPARK

df.withColumn("row_num",
row_number().over(Window.orderBy("col2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

WINDOW FUNCTIONS (AGGREGATIONS)


SQL
SELECT col1,
SUM(col2) OVER (PARTITION BY col3) AS sum_col2
FROM table;

PYSPARK

df.withColumn("sum_col2", sum("col2") \
.over(Window.partitionBy("col3")))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

LAG FUNCTION
SQL
SELECT col1,
LAG(col2, 1) OVER (ORDER BY col3)
AS lag_col2 FROM table;

PYSPARK

df.withColumn("lag_col2", lag("col2", 1) \
.over(Window.orderBy("col3")))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

LEAD FUNCTION
SQL
SELECT col1,
LEAD(col2, 1) OVER (ORDER BY col3) AS lead_col2
FROM table;

PYSPARK

df.withColumn("lead_col2", lead("col2", 1) \
.over(Window.orderBy("col3")))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

HANDLING NULLS (IS NULL)


SQL

SELECT * FROM table WHERE col1 IS NULL;

PYSPARK

df.filter(df.col1.isNull())

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

HANDLING NULLS (IS NOT NULL)


SQL
SELECT * FROM table WHERE col1
IS NOT NULL;

PYSPARK

df.filter(df.col1.isNotNull())

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

UNION OF TWO TABLES


SQL
SELECT * FROM table1
UNION
SELECT * FROM table2;

PYSPARK

df1.union(df2)

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

INTERSECT OF TWO TABLES


SQL
SELECT * FROM table1
INTERSECT
SELECT * FROM table2;

PYSPARK

df1.intersect(df2)

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

EXCEPT (DIFFERENCE) OF TWO TABLES


SQL
SELECT * FROM table1
EXCEPT
SELECT * FROM table2;

PYSPARK

df1.subtract(df2)

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

CREATING TEMPORARY VIEW


SQL
CREATE TEMP VIEW temp_table
AS SELECT * FROM table;

PYSPARK

df.createOrReplaceTempView("temp_table")

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

USING SQL QUERIES ON DATAFRAMES


SQL

SELECT * FROM temp_table;

PYSPARK

spark.sql("SELECT * FROM temp_table")

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

PIVOTING DATA
SQL
SELECT * FROM
(SELECT col1, col2 FROM table)
PIVOT (SUM(col2) FOR col1 IN ('value1', 'value2'));

PYSPARK

df.groupBy()\
.pivot("col1", ['value1', 'value2']).sum("col2")

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

UPDATING ROWS
SQL
UPDATE table SET col1 = value WHERE
condition;

PYSPARK

df = df.withColumn("col1", when(condition,
value).otherwise(df.col1))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

DELETING ROWS
SQL

DELETE FROM table WHERE condition;

PYSPARK

df = df.filter(~condition)

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

HANDLING DUPLICATES
SQL
SELECT col1, COUNT(*) FROM table
GROUP BY col1 HAVING COUNT(*) > 1;

PYSPARK

df.groupBy("col1").count().filter("count > 1")

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

CALCULATING PERCENTAGE
SQL
SELECT col1, (col2 / col3) * 100 AS percentage
FROM table;

PYSPARK

df.withColumn("percentage", (df.col2 /
df.col3) * 100)

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

STRING FUNCTIONS (CONCATENATION)


SQL
SELECT CONCAT(col1, col2) AS new_col
FROM table;

PYSPARK

df.select(concat("col1", "col2") \
.alias("new_col"))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

DATE FUNCTIONS (CURRENT DATE)


SQL

SELECT CURRENT_DATE AS today;

PYSPARK

df.select(current_date().alias("today"))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

EXTRACTING YEAR FROM DATE


SQL
SELECT EXTRACT(YEAR FROM date_col) AS year
FROM table;

PYSPARK

df.select(year("date_col").alias("year"))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

CONDITIONAL AGGREGATION
SQL
SELECT SUM(CASE WHEN condition THEN col1 ELSE 0 END)
FROM table;

PYSPARK

df.select(sum(when(condition, df.col1) \
.otherwise(0)))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

RENAMING COLUMNS
SQL

SELECT col1 AS new_col1 FROM table;

PYSPARK

df.withColumnRenamed("col1", "new_col1")

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

DROPPING COLUMNS
SQL

ALTER TABLE table DROP COLUMN col1;

PYSPARK

df.drop("col1")

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

ADDING NEW COLUMNS


SQL
ALTER TABLE table
ADD COLUMN new_col data_type;

PYSPARK

df.withColumn("new_col", expression)

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

REPLACING VALUES
SQL

UPDATE table SET col1 = new_value


WHERE condition;

PYSPARK

df.withColumn("col1", when(condition,
new_value).otherwise(df.col1))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

USING UDFS (USER-DEFINED FUNCTIONS)


SQL
CREATE FUNCTION my_udf AS ...;
SELECT my_udf(col1) FROM table;

PYSPARK

df.select(my_udf("col1"))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

EXPLODING ARRAYS
SQL
SELECT col1, EXPLODE(array_col) AS exploded_col
FROM table;

PYSPARK

df.select("col1", explode("array_col") \
.alias("exploded_col"))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

FLATTENING NESTED DATA


SQL

SELECT col1, nested_col.* FROM table;

PYSPARK

df.select("col1", "nested_col.*")

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

CASTING DATA TYPES


SQL
SELECT CAST(col1 AS data_type)
FROM table;

PYSPARK

df.select(df.col1.cast("data_type"))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

HANDLING JSON DATA


SQL
SELECT JSON_VALUE(json_col, '$.key')
FROM table;

PYSPARK

df.select(get_json_object("json_col", "$.key"))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

GROUPING SETS
SQL
SELECT col1, col2, SUM(col3)
FROM table
GROUP BY GROUPING SETS ((col1), (col2));

PYSPARK

df.groupBy("col1",
"col2").agg(sum("col3"))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

ROLLUP
SQL
SELECT col1, col2, SUM(col3)
FROM table
GROUP BY ROLLUP(col1, col2);

PYSPARK

df.rollup("col1", "col2") \
.agg(sum("col3"))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

CUBE
SQL
SELECT col1, col2, SUM(col3)
FROM table GROUP BY CUBE(col1, col2);

PYSPARK

df.cube("col1", "col2") \
.agg(sum("col3"))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

RANK FUNCTION
SQL
SELECT col1,
RANK() OVER (ORDER BY col2) AS rank
FROM table;

PYSPARK

df.withColumn("rank", rank() \
.over(Window.orderBy("col2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

DENSE RANK FUNCTION


SQL
SELECT col1,
DENSE_RANK() OVER (ORDER BY col2) AS dense_rank
FROM table;

PYSPARK

df.withColumn("dense_rank",
dense_rank().over(Window.orderBy("col2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

CUMULATIVE SUM (RUNNING TOTAL)


SQL
SELECT col1,
SUM(col2) OVER (ORDER BY col1) AS running_total
FROM table;

PYSPARK
df.withColumn("running_total", sum("col2")\
.over(Window.orderBy("col1") \
.rowsBetween(Window.unboundedPreceding,
Window.currentRow)))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

HANDLING DATES (DATE DIFFERENCE)


SQL

SELECT DATEDIFF(date1, date2) FROM table;

PYSPARK

df.select(datediff("date1", "date2"))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

STRING FUNCTIONS (SUBSTRING)


SQL
SELECT SUBSTRING(col1, start, length)
FROM table;

PYSPARK

df.select(substring("col1", start, length))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

UPPER AND LOWER CASE CONVERSION


SQL
SELECT UPPER(col1), LOWER(col2)
FROM table;

PYSPARK

df.select(upper("col1"), lower("col2"))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

FILTER WITH IN CLAUSE


SQL
SELECT * FROM table
WHERE col1 IN (value1, value2);

PYSPARK

df.filter(df.col1.isin(value1, value2))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

FILTER WITH BETWEEN CLAUSE


SQL
SELECT * FROM table
WHERE col1 BETWEEN value1 AND value2;

PYSPARK

df.filter(df.col1.between(value1, value2))

Shwetank Singh
GritSetGrow - GSGLearn.com
Data Engineering 101: SQL and PySpark

ORDER BY MULTIPLE COLUMNS


SQL
SELECT * FROM table
ORDER BY col1, col2 DESC;

PYSPARK

df.orderBy("col1", df.col2.desc())

Shwetank Singh
GritSetGrow - GSGLearn.com

You might also like