-- Data source from https://www.sqlservertutorial.net/sql-server-sample-database/
-- Objectives
-- 1. Forecast revenue for the next 6 months based on available data.
-- 2. Identify potential seasonal patterns in purchases.
-- 3. Identify key drivers of revenue.
-- Getting to know the data
-- Date range
SELECT MAX(order_date), MIN(order_date)
FROM sales.orders;
-- List of columns relevant to analysis(
-- brand_name, *good
-- category_name, *good
-- product_name, *good
-- model_year, *good
-- city, *good
-- state, *good
-- order_id, *good
-- store_name, *good
-- staff_id, *good
-- quantity, *good
-- list_price, *good
-- discount, *good
-- order_date) *good
-- Evaluating our tables to see if any pre cleaning needs to be done prior to joining and
-- checking columns off as good to join in our list above.
-- Using an occasional DISTINCT statement when curious. Looking out for obvious errors
-- in each table. Being cautious of anything that might affect my analysis.
SELECT *
FROM sales.store;
SELECT *
FROM
production.brand;
SELECT *
FROM
production.category;
SELECT *
FROM production.product;
SELECT DISTINCT product_id, product_name
FROM production.product;
SELECT *
FROM sales.customer;
SELECT DISTINCT city, state
FROM sales.customer;
SELECT *
FROM sales.orders;
SELECT *
FROM sales.staff;
SELECT *
FROM sales.order_item;
-- Joining relevant columns
SELECT
b.brand_name,
cat.category_name,
p.product_name,
p.model_year,
cus.city,
cus.state,
o.order_id,
o.order_date,
s.store_name,
staff.staff_id,
oi.quantity,
oi.list_price,
oi.discount
FROM
sales.orders o
LEFT JOIN
sales.order_item oi ON oi.order_id = o.order_id
LEFT JOIN
sales.staff staff ON staff.staff_id = o.staff_id
LEFT JOIN
sales.store s ON s.store_id = staff.store_id
LEFT JOIN
sales.customer cus ON cus.customer_id = o.customer_id
LEFT JOIN
production.product p ON p.product_id = oi.product_id
LEFT JOIN
production.category cat ON cat.category_id = p.category_id
LEFT JOIN
production.brand b ON b.brand_id = p.brand_id;
-- Output represents desired columns and a more granular viewpoint of purchases. Each
-- order has been broken up into rows that show exactly what product was
-- purchased by quantity for every order.
-- Double checking previous output. Previous output should have same number of rows as
-- the order_item table.
SELECT *
FROM sales.order_item
-- Confirmed
-- Placing previous JOIN statement into a new table.
SELECT
b.brand_name,
cat.category_name,
p.product_name,
p.model_year,
cus.city,
cus.state,
o.order_id,
o.order_date,
s.store_name,
staff.staff_id,
oi.quantity,
oi.list_price,
oi.discount
INTO bike_orders
FROM
sales.orders o
LEFT JOIN
sales.order_item oi ON oi.order_id = o.order_id
LEFT JOIN
sales.staff staff ON staff.staff_id = o.staff_id
LEFT JOIN
sales.store s ON s.store_id = staff.store_id
LEFT JOIN
sales.customer cus ON cus.customer_id = o.customer_id
LEFT JOIN
production.product p ON p.product_id = oi.product_id
LEFT JOIN
production.category cat ON cat.category_id = p.category_id
LEFT JOIN
production.brand b ON b.brand_id = p.brand_id;
-- Beginning cleaning process
-- Removing leading and trailing whitespaces
-- I used the SSMS GUI to view all the datatypes before writing the following statement.
UPDATE bike_orders
SET
brand_name = TRIM(brand_name),
category_name = TRIM(category_name),
product_name = TRIM(product_name),
city = TRIM(city),
state = TRIM(state),
store_name = TRIM(store_name);
-- Visual inspection
SELECT *
FROM bike_orders;
-- Checking for duplicates
SELECT DISTINCT *
FROM bike_orders
-- No duplicates found
-- Fix Format Issues
-- Up to this point, the quality of the data has looked pretty good. If anything comes
-- up throughout the rest of the process, I will address it.
-- Change Data Types
-- Data types are solid.
-- Fill or Remove Nulls
SELECT *
FROM bike_orders
WHERE
brand_name IS NULL OR
category_name IS NULL OR
product_name IS NULL OR
model_year IS NULL OR
city IS NULL OR
state IS NULL OR
order_id IS NULL OR
order_date IS NULL OR
store_name IS NULL OR
staff_id IS NULL OR
quantity IS NULL OR
list_price IS NULL OR
discount IS NULL;
-- There doesnt appear to be null values. I'm going to verify with 2 more statements
SELECT *
FROM bike_orders
WHERE discount IS NULL;
SELECT *
FROM bike_orders
WHERE city IS NULL;
-- No null values present
-- Handle Outliers
-- This is a fairly simple dataset so I will visually inspect first and only use a
-- statistical approach if necessary.
SELECT
MAX(model_year) AS maxyear,
MIN(model_year) AS minyear,
MAX(quantity) AS maxquantity,
MIN(quantity) AS minquantity,
MAX(list_price) AS maxprice,
MIN(list_price) AS minprice,
MAX(discount) AS maxdiscount,
MIN(discount) AS mindiscount
FROM bike_orders;
-- Maxquantity is 2 which seems low.
-- Maxprice is about 12k which seems high.
-- Evaluating
SELECT *
FROM bike_orders
ORDER BY quantity DESC;
-- The max quantity is reasonable considering it represents quantity by product. So
-- a customer might buy 10 bikes in one order and not get 2 of the exact same bike.
SELECT *
FROM bike_orders
ORDER BY list_price DESC;
-- The bike priced at about 12k isnt a cause for concern. Many high ticket bikes exist
-- in our table.
-- Standardize Data
-- Unit measurements are already consistent.
-- Validate data
-- Reviewed cleaning process and there are no causes for concern.
SELECT *
FROM bike_orders
-- Data cleaning complete
-- Reviewing objectives
-- Feature engineering for objective 2 (Identify potential seasonal patterns in purchases.)
-- Adding order_month column
-- ALTER TABLE bike_orders
-- ADD order_month VARCHAR;
-- Assigning month values to order_month column
-- UPDATE bike_orders
-- SET order_month = MONTH(order_date);
-- Verifying
-- SELECT *
-- FROM bike_orders;
-- Values over 9 received a * value.
-- Verifying
-- SELECT *
-- FROM bike_orders
-- WHERE order_month > 9;
-- Error: Conversion failed when converting the varchar value '*' to data type int.
-- I should have added the column as INT and not VARCHAR. I falsely expected a string
-- version like "Jan".
-- Dropping column
-- ALTER TABLE bike_orders
-- DROP COLUMN order_month;
-- Adding column back as INT
ALTER TABLE bike_orders
ADD order_month INT;
-- Assigning month values to order_month column
UPDATE bike_orders
SET order_month = MONTH(order_date);
-- Verifying
SELECT *
FROM bike_orders;
-- Issue resolved
-- Feature engineering for objective 3 (Identify key drivers of revenue.)
-- Adding revenue column
ALTER TABLE bike_orders
ADD revenue float;
-- Assigning values to revenue column
UPDATE bike_orders
SET revenue = (quantity * list_price) - (quantity * discount);
-- Verifying
SELECT *
FROM bike_orders;
-- Column is good
-- Data preparation complete