import pyodbc 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
conn = pyodbc.connect("Driver={SQL Server Native Client 11.0};"
                        "Server=localhost;"
                        "Database=Superstore;"
                        "Trusted_Connection=yes"
                       )


orders = pd.read_sql_query(
    '''Select * 
       ,datefromparts(year(order_date),month(order_date),1) Order_Date_Trunc
       ,datediff(day,order_date,ship_date) Time_to_Ship
    from orders'''
    , conn)
# person = pd.read_sql_query(
#     '''Select * 
#     from person'''
#     , conn)
conn.close()


orders.describe()


orders.nunique()

Row_ID              9994
Order_ID            5009
Order_Date          1236
Ship_Date           1334
Ship_Mode              4
Customer_ID          793
Customer_Name        793
Segment                3
Country_Region         1
City                 531
State                 49
Postal_Code          630
Region                 4
Product_ID          1862
Category               3
Sub_Category          17
Product_Name        1849
Sales               5825
Quantity              14
Discount              12
Profit              7286
Order_Date_Trunc      48
Time_to_Ship           8
dtype: int64


#Create list of columns
order_group = orders.loc[:,[col for col in orders.columns if (orders[col].nunique() <= 5 and orders[col].nunique() >1)]]

#Show unique values for each column in the list
for col in order_group.columns:
    print(col, ':', orders[col].unique())

Ship_Mode : ['Second Class' 'Standard Class' 'First Class' 'Same Day']
Segment : ['Consumer' 'Corporate' 'Home Office']
Region : ['South' 'West' 'Central' 'East']
Category : ['Furniture' 'Office Supplies' 'Technology']


#Create df to count orders, grouped by time to ship
orders_by_ship_time = orders.groupby(['Time_to_Ship']).agg(
    order_ct = pd.NamedAgg(column='Order_ID', aggfunc='count')
).reset_index()

#Plot bar graph
orders_by_ship_time.iloc[:,1:].plot.bar(figsize=(10,6))
plt.xlabel('Time to Ship (Days)')
plt.title('Orders by Time to Ship')
plt.tight_layout()


#Create df to count orders, grouped by count of order_id
orders_by_time_mode = orders.groupby(['Time_to_Ship','Ship_Mode']).agg(
    order_ct = pd.NamedAgg(column='Order_ID', aggfunc='count')
).reset_index()
#Create pivot to split out 'Ship_Mode'
pivot = orders_by_time_mode.pivot(index = 'Time_to_Ship', columns='Ship_Mode', values='order_ct').reset_index().fillna(0)

#Plot stacked bar
pivot.iloc[:,1:].plot.bar(stacked=True,figsize=(10,6))
plt.xlabel('Time to Ship (Days)')
plt.title('Orders by Time to Ship and Ship Mode')
plt.tight_layout()


#Create df to count orders, grouped by month of the order
orders_by_date = orders.groupby(['Order_Date_Trunc']).agg(
    order_ct = pd.NamedAgg(column='Order_ID', aggfunc='count')
).reset_index()

#Plot line graph
plt.subplots(figsize=(10,5))
plt.plot(orders_by_date['Order_Date_Trunc'],orders_by_date['order_ct'])
plt.title('Orders by Date')
plt.tight_layout()


#isolate the columns we want to use
order_profit = orders.loc[:,['Order_ID','Profit','Discount']]

#add flag for negative orders
order_profit.loc[:,'Negative_Order'] =  np.where(order_profit['Profit'] < 0, 1, 0)

#create statement about what percent of orders are negative profit
print('There are',order_profit['Negative_Order'].sum(),'negative profit items. These account for', round((order_profit['Negative_Order'].sum()/order_profit['Negative_Order'].count())*100,2),'% of all items.')

There are 1870 negative profit items. These account for 18.71 % of all items.


#add flag for negative orders with discount
order_profit.loc[:,'Neg_W_Disc'] =  np.where((order_profit['Profit'] < 0)  & (order_profit['Discount'] > 0), 1, 0)

#create statement about what percent of orders are negative profit
print('There are',order_profit['Neg_W_Disc'].sum(),'negative profit items with discounts. These account for', round((order_profit['Neg_W_Disc'].sum()/order_profit['Negative_Order'].sum())*100,2),'% of all items.')

There are 1870 negative profit items with discounts. These account for 100.0 % of all items.


#Create df to count orders, grouped by month of the order
orders_by_profit = orders.groupby(['Order_ID']).agg(
    Item_ct = pd.NamedAgg(column='Row_ID', aggfunc='count'),
    Profit = pd.NamedAgg(column='Profit', aggfunc=sum),
    Max_Discount = pd.NamedAgg(column='Discount', aggfunc=max),
).reset_index()

#add flag for negative orders
orders_by_profit.loc[:,'Negative_Order'] =  np.where(orders_by_profit['Profit'] < 0, 1, 0)

#create statement about what percent of orders are negative profit
print('There are',orders_by_profit['Negative_Order'].sum(),'negative orders. These account for', round((orders_by_profit['Negative_Order'].sum()/orders_by_profit['Negative_Order'].count())*100,2),'% of all orders.')

There are 1022 negative orders. These account for 20.4 % of all orders.

	Row_ID	Postal_Code	Sales	Quantity	Discount	Profit	Time_to_Ship
count	9994.000000	9983.000000	9994.000000	9994.000000	9994.000000	9993.000000	9994.000000
mean	4997.500000	55245.233297	229.858000	3.789574	0.156203	28.701793	3.958275
std	2885.163629	32038.715955	623.245098	2.225110	0.206452	234.228826	1.747598
min	1.000000	1040.000000	0.444000	1.000000	0.000000	-6599.978027	0.000000
25%	2499.250000	23223.000000	17.280001	2.000000	0.000000	1.731000	3.000000
50%	4997.500000	57103.000000	54.490000	3.000000	0.200000	8.671000	4.000000
75%	7495.750000	90008.000000	209.940002	5.000000	0.200000	29.364000	5.000000
max	9994.000000	99301.000000	22638.480469	14.000000	0.800000	8399.975586	7.000000

Tableau's Superstore Dataset Exploratory Data Analysis (EDA)

Load Dependencies and Data

Dependencies and connection to SQL

Load data from SQL

Exploratory Data Analysis

Describe Columns

Check unique values

List all Columns with two to five unique values and show those values

Plot 'Time to Ship' (ship_date - order_date)

Check distributions of orders

Explore orders with negative profit lines

Check profit at order_id level