import pyodbc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
conn = pyodbc.connect("Driver={SQL Server Native Client 11.0};"
"Server=localhost;"
"Database=Superstore;"
"Trusted_Connection=yes"
)
orders = pd.read_sql_query(
'''Select *
,datefromparts(year(order_date),month(order_date),1) Order_Date_Trunc
,datediff(day,order_date,ship_date) Time_to_Ship
from orders'''
, conn)
# person = pd.read_sql_query(
# '''Select *
# from person'''
# , conn)
conn.close()
orders.describe()
Row_ID | Postal_Code | Sales | Quantity | Discount | Profit | Time_to_Ship | |
---|---|---|---|---|---|---|---|
count | 9994.000000 | 9983.000000 | 9994.000000 | 9994.000000 | 9994.000000 | 9993.000000 | 9994.000000 |
mean | 4997.500000 | 55245.233297 | 229.858000 | 3.789574 | 0.156203 | 28.701793 | 3.958275 |
std | 2885.163629 | 32038.715955 | 623.245098 | 2.225110 | 0.206452 | 234.228826 | 1.747598 |
min | 1.000000 | 1040.000000 | 0.444000 | 1.000000 | 0.000000 | -6599.978027 | 0.000000 |
25% | 2499.250000 | 23223.000000 | 17.280001 | 2.000000 | 0.000000 | 1.731000 | 3.000000 |
50% | 4997.500000 | 57103.000000 | 54.490000 | 3.000000 | 0.200000 | 8.671000 | 4.000000 |
75% | 7495.750000 | 90008.000000 | 209.940002 | 5.000000 | 0.200000 | 29.364000 | 5.000000 |
max | 9994.000000 | 99301.000000 | 22638.480469 | 14.000000 | 0.800000 | 8399.975586 | 7.000000 |
Discount appears to be a percent. We may be able to determine if the discount is applied before or after 'Sales'
orders.nunique()
Row_ID 9994 Order_ID 5009 Order_Date 1236 Ship_Date 1334 Ship_Mode 4 Customer_ID 793 Customer_Name 793 Segment 3 Country_Region 1 City 531 State 49 Postal_Code 630 Region 4 Product_ID 1862 Category 3 Sub_Category 17 Product_Name 1849 Sales 5825 Quantity 14 Discount 12 Profit 7286 Order_Date_Trunc 48 Time_to_Ship 8 dtype: int64
It looks like there are a handful that might be useful. We will look at anything with less than five unique values and figure out what those values are
#Create list of columns
order_group = orders.loc[:,[col for col in orders.columns if (orders[col].nunique() <= 5 and orders[col].nunique() >1)]]
#Show unique values for each column in the list
for col in order_group.columns:
print(col, ':', orders[col].unique())
Ship_Mode : ['Second Class' 'Standard Class' 'First Class' 'Same Day'] Segment : ['Consumer' 'Corporate' 'Home Office'] Region : ['South' 'West' 'Central' 'East'] Category : ['Furniture' 'Office Supplies' 'Technology']
'Ship Mode' has four different categories which may be correlated with an amount of time. Let's look at that distribution
#Create df to count orders, grouped by time to ship
orders_by_ship_time = orders.groupby(['Time_to_Ship']).agg(
order_ct = pd.NamedAgg(column='Order_ID', aggfunc='count')
).reset_index()
#Plot bar graph
orders_by_ship_time.iloc[:,1:].plot.bar(figsize=(10,6))
plt.xlabel('Time to Ship (Days)')
plt.title('Orders by Time to Ship')
plt.tight_layout()
Again, this looks pretty normal, but I wonder if there's a correlation to 'Ship_Mode'
#Create df to count orders, grouped by count of order_id
orders_by_time_mode = orders.groupby(['Time_to_Ship','Ship_Mode']).agg(
order_ct = pd.NamedAgg(column='Order_ID', aggfunc='count')
).reset_index()
#Create pivot to split out 'Ship_Mode'
pivot = orders_by_time_mode.pivot(index = 'Time_to_Ship', columns='Ship_Mode', values='order_ct').reset_index().fillna(0)
#Plot stacked bar
pivot.iloc[:,1:].plot.bar(stacked=True,figsize=(10,6))
plt.xlabel('Time to Ship (Days)')
plt.title('Orders by Time to Ship and Ship Mode')
plt.tight_layout()
This may be worth showing in a visual because there appears to be some disrepancy in shipping time.
#Create df to count orders, grouped by month of the order
orders_by_date = orders.groupby(['Order_Date_Trunc']).agg(
order_ct = pd.NamedAgg(column='Order_ID', aggfunc='count')
).reset_index()
#Plot line graph
plt.subplots(figsize=(10,5))
plt.plot(orders_by_date['Order_Date_Trunc'],orders_by_date['order_ct'])
plt.title('Orders by Date')
plt.tight_layout()
There is a definite seasonal pattern, though it appears orders have increased year over year
We saw eariler in the description (cell 3) that the minimum profit was -$6,600. I'd like to see how prevalent negative profit is
#isolate the columns we want to use
order_profit = orders.loc[:,['Order_ID','Profit','Discount']]
#add flag for negative orders
order_profit.loc[:,'Negative_Order'] = np.where(order_profit['Profit'] < 0, 1, 0)
#create statement about what percent of orders are negative profit
print('There are',order_profit['Negative_Order'].sum(),'negative profit items. These account for', round((order_profit['Negative_Order'].sum()/order_profit['Negative_Order'].count())*100,2),'% of all items.')
There are 1870 negative profit items. These account for 18.71 % of all items.
Nearly 20% of all items have a negative profit. This is probably something we will want to report on in our visualization. I'm wondering if all of these are due to discounts. Let's find out...
#add flag for negative orders with discount
order_profit.loc[:,'Neg_W_Disc'] = np.where((order_profit['Profit'] < 0) & (order_profit['Discount'] > 0), 1, 0)
#create statement about what percent of orders are negative profit
print('There are',order_profit['Neg_W_Disc'].sum(),'negative profit items with discounts. These account for', round((order_profit['Neg_W_Disc'].sum()/order_profit['Negative_Order'].sum())*100,2),'% of all items.')
There are 1870 negative profit items with discounts. These account for 100.0 % of all items.
So all negative profit lines are based on discount. Let's roll this up to the overall order_id and see if we still see negative profit
#Create df to count orders, grouped by month of the order
orders_by_profit = orders.groupby(['Order_ID']).agg(
Item_ct = pd.NamedAgg(column='Row_ID', aggfunc='count'),
Profit = pd.NamedAgg(column='Profit', aggfunc=sum),
Max_Discount = pd.NamedAgg(column='Discount', aggfunc=max),
).reset_index()
#add flag for negative orders
orders_by_profit.loc[:,'Negative_Order'] = np.where(orders_by_profit['Profit'] < 0, 1, 0)
#create statement about what percent of orders are negative profit
print('There are',orders_by_profit['Negative_Order'].sum(),'negative orders. These account for', round((orders_by_profit['Negative_Order'].sum()/orders_by_profit['Negative_Order'].count())*100,2),'% of all orders.')
There are 1022 negative orders. These account for 20.4 % of all orders.
We are seeing a higher percentage of orders than items. This is logical as there are likely many orders with one discount applied to the entire order.