1978 lines
84 KiB
Text
Executable file
1978 lines
84 KiB
Text
Executable file
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Session 1 - DataFrames - Lesson 9: Pivot Tables and Data Reshaping\n",
|
|
"\n",
|
|
"## Learning Objectives\n",
|
|
"- Master pivot table creation and customization\n",
|
|
"- Understand data reshaping with melt, pivot, stack, and unstack\n",
|
|
"- Learn cross-tabulation and contingency tables\n",
|
|
"- Practice with multi-level indexing and hierarchical data\n",
|
|
"- Apply reshaping techniques to real-world analysis scenarios\n",
|
|
"\n",
|
|
"## Prerequisites\n",
|
|
"- Completed Lessons 1-8\n",
|
|
"- Understanding of aggregation and groupby operations\n",
|
|
"- Familiarity with Excel pivot tables (helpful but not required)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Libraries loaded successfully!\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Import required libraries\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"from datetime import datetime, timedelta\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import seaborn as sns\n",
|
|
"import warnings\n",
|
|
"warnings.filterwarnings('ignore')\n",
|
|
"\n",
|
|
"# Set display options\n",
|
|
"pd.set_option('display.max_columns', None)\n",
|
|
"pd.set_option('display.max_rows', 20)\n",
|
|
"pd.set_option('display.width', None)\n",
|
|
"\n",
|
|
"print(\"Libraries loaded successfully!\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Creating Sample Dataset\n",
|
|
"\n",
|
|
"Let's create a comprehensive business dataset for pivot table examples."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Business dataset created:\n",
|
|
"Shape: (1000, 20)\n",
|
|
"\n",
|
|
"First few rows:\n",
|
|
" date salesperson region product_category product customer_type \\\n",
|
|
"0 2024-01-01 Grace Lee North Clothing Shoes Returning \n",
|
|
"1 2024-01-02 Diana Prince East Clothing Magazine Returning \n",
|
|
"2 2024-01-03 Henry Davis West Books Shoes VIP \n",
|
|
"3 2024-01-04 Eve Wilson West Clothing Novel New \n",
|
|
"4 2024-01-05 Grace Lee East Clothing Laptop VIP \n",
|
|
"\n",
|
|
" sales_channel quantity unit_price discount_percent shipping_cost \\\n",
|
|
"0 Online 1 72.81 0 16.13 \n",
|
|
"1 Store 9 99.98 15 10.42 \n",
|
|
"2 Online 6 4.61 0 8.73 \n",
|
|
"3 Phone 8 31.51 0 8.27 \n",
|
|
"4 Store 3 86.28 15 11.61 \n",
|
|
"\n",
|
|
" gross_sales discount_amount net_sales total_order year month quarter \\\n",
|
|
"0 72.81 0.000 72.810 88.940 2024 1 1 \n",
|
|
"1 899.82 134.973 764.847 775.267 2024 1 1 \n",
|
|
"2 27.66 0.000 27.660 36.390 2024 1 1 \n",
|
|
"3 252.08 0.000 252.080 260.350 2024 1 1 \n",
|
|
"4 258.84 38.826 220.014 231.624 2024 1 1 \n",
|
|
"\n",
|
|
" day_of_week month_name \n",
|
|
"0 Monday January \n",
|
|
"1 Tuesday January \n",
|
|
"2 Wednesday January \n",
|
|
"3 Thursday January \n",
|
|
"4 Friday January \n",
|
|
"\n",
|
|
"Column info:\n",
|
|
"date datetime64[ns]\n",
|
|
"salesperson object\n",
|
|
"region object\n",
|
|
"product_category object\n",
|
|
"product object\n",
|
|
"customer_type object\n",
|
|
"sales_channel object\n",
|
|
"quantity int64\n",
|
|
"unit_price float64\n",
|
|
"discount_percent int64\n",
|
|
"shipping_cost float64\n",
|
|
"gross_sales float64\n",
|
|
"discount_amount float64\n",
|
|
"net_sales float64\n",
|
|
"total_order float64\n",
|
|
"year int32\n",
|
|
"month int32\n",
|
|
"quarter int32\n",
|
|
"day_of_week object\n",
|
|
"month_name object\n",
|
|
"dtype: object\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Create comprehensive business dataset\n",
|
|
"np.random.seed(42)\n",
|
|
"n_records = 1000\n",
|
|
"\n",
|
|
"# Generate realistic business data\n",
|
|
"business_data = {\n",
|
|
" 'date': pd.date_range('2024-01-01', periods=n_records, freq='D'),\n",
|
|
" 'salesperson': np.random.choice([\n",
|
|
" 'Alice Johnson', 'Bob Smith', 'Charlie Brown', 'Diana Prince', 'Eve Wilson',\n",
|
|
" 'Frank Miller', 'Grace Lee', 'Henry Davis', 'Ivy Chen', 'Jack Robinson'\n",
|
|
" ], n_records),\n",
|
|
" 'region': np.random.choice(['North', 'South', 'East', 'West'], n_records),\n",
|
|
" 'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Home & Garden'], n_records),\n",
|
|
" 'product': np.random.choice([\n",
|
|
" 'Laptop', 'Phone', 'Tablet', 'Headphones', 'Speaker',\n",
|
|
" 'Shirt', 'Pants', 'Shoes', 'Jacket', 'Hat',\n",
|
|
" 'Novel', 'Textbook', 'Magazine', 'Comic', 'Cookbook',\n",
|
|
" 'Plant', 'Tool', 'Furniture', 'Decoration', 'Garden'\n",
|
|
" ], n_records),\n",
|
|
" 'customer_type': np.random.choice(['New', 'Returning', 'VIP'], n_records, p=[0.3, 0.5, 0.2]),\n",
|
|
" 'sales_channel': np.random.choice(['Online', 'Store', 'Phone'], n_records, p=[0.6, 0.3, 0.1]),\n",
|
|
" 'quantity': np.random.randint(1, 10, n_records),\n",
|
|
" 'unit_price': np.random.normal(50, 20, n_records),\n",
|
|
" 'discount_percent': np.random.choice([0, 5, 10, 15, 20], n_records, p=[0.5, 0.2, 0.2, 0.08, 0.02]),\n",
|
|
" 'shipping_cost': np.random.normal(8, 3, n_records)\n",
|
|
"}\n",
|
|
"\n",
|
|
"df_business = pd.DataFrame(business_data)\n",
|
|
"\n",
|
|
"# Clean and calculate derived fields\n",
|
|
"df_business['unit_price'] = np.abs(df_business['unit_price']).round(2)\n",
|
|
"df_business['shipping_cost'] = np.abs(df_business['shipping_cost']).round(2)\n",
|
|
"df_business['gross_sales'] = df_business['quantity'] * df_business['unit_price']\n",
|
|
"df_business['discount_amount'] = df_business['gross_sales'] * df_business['discount_percent'] / 100\n",
|
|
"df_business['net_sales'] = df_business['gross_sales'] - df_business['discount_amount']\n",
|
|
"df_business['total_order'] = df_business['net_sales'] + df_business['shipping_cost']\n",
|
|
"\n",
|
|
"# Add time-based columns\n",
|
|
"df_business['year'] = df_business['date'].dt.year\n",
|
|
"df_business['month'] = df_business['date'].dt.month\n",
|
|
"df_business['quarter'] = df_business['date'].dt.quarter\n",
|
|
"df_business['day_of_week'] = df_business['date'].dt.day_name()\n",
|
|
"df_business['month_name'] = df_business['date'].dt.month_name()\n",
|
|
"\n",
|
|
"print(\"Business dataset created:\")\n",
|
|
"print(f\"Shape: {df_business.shape}\")\n",
|
|
"print(\"\\nFirst few rows:\")\n",
|
|
"print(df_business.head())\n",
|
|
"print(\"\\nColumn info:\")\n",
|
|
"print(df_business.dtypes)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 1. Basic Pivot Tables\n",
|
|
"\n",
|
|
"Creating fundamental pivot tables for data summarization."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 26,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== BASIC PIVOT TABLES ===\n",
|
|
"Sales by Region and Product Category:\n",
|
|
"product_category Books Clothing Electronics Home & Garden\n",
|
|
"region \n",
|
|
"East 12377.0 12466.0 16463.0 14140.0\n",
|
|
"North 13492.0 14938.0 15229.0 17770.0\n",
|
|
"South 17590.0 16946.0 14552.0 13695.0\n",
|
|
"West 8157.0 16544.0 12681.0 15652.0\n",
|
|
"\n",
|
|
"Average Order Value by Customer Type and Sales Channel:\n",
|
|
"sales_channel Online Phone Store\n",
|
|
"customer_type \n",
|
|
"New 263.07 244.46 222.16\n",
|
|
"Returning 246.03 220.45 224.39\n",
|
|
"VIP 234.35 237.16 253.16\n",
|
|
"\n",
|
|
"Transaction Count by Region and Customer Type:\n",
|
|
"customer_type New Returning VIP\n",
|
|
"region \n",
|
|
"East 60 128 60\n",
|
|
"North 70 144 53\n",
|
|
"South 68 138 45\n",
|
|
"West 61 119 54\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Simple pivot table - sales by region and product category\n",
|
|
"print(\"=== BASIC PIVOT TABLES ===\")\n",
|
|
"\n",
|
|
"# Basic pivot: sum of sales by region and product category\n",
|
|
"basic_pivot = df_business.pivot_table(\n",
|
|
" values='net_sales',\n",
|
|
" index='region',\n",
|
|
" columns='product_category',\n",
|
|
" aggfunc='sum',\n",
|
|
" fill_value=0\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"Sales by Region and Product Category:\")\n",
|
|
"print(basic_pivot.round(0))\n",
|
|
"\n",
|
|
"# Average order value by customer type and sales channel\n",
|
|
"avg_order_pivot = df_business.pivot_table(\n",
|
|
" values='total_order',\n",
|
|
" index='customer_type',\n",
|
|
" columns='sales_channel',\n",
|
|
" aggfunc='mean',\n",
|
|
" fill_value=0\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"\\nAverage Order Value by Customer Type and Sales Channel:\")\n",
|
|
"print(avg_order_pivot.round(2))\n",
|
|
"\n",
|
|
"# Count of transactions\n",
|
|
"transaction_count = df_business.pivot_table(\n",
|
|
" values='total_order',\n",
|
|
" index='region',\n",
|
|
" columns='customer_type',\n",
|
|
" aggfunc='count',\n",
|
|
" fill_value=0\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"\\nTransaction Count by Region and Customer Type:\")\n",
|
|
"print(transaction_count)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 27,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== MULTIPLE AGGREGATION FUNCTIONS ===\n",
|
|
"Multiple aggregations (sum, mean, count):\n",
|
|
" sum mean \\\n",
|
|
"product_category Books Clothing Electronics Home & Garden Books \n",
|
|
"region \n",
|
|
"East 12376.85 12465.78 16462.72 14139.75 217.14 \n",
|
|
"North 13491.69 14938.35 15228.66 17769.95 214.15 \n",
|
|
"South 17589.76 16945.54 14551.58 13695.40 266.51 \n",
|
|
"West 8156.64 16544.42 12680.58 15651.63 189.69 \n",
|
|
"\n",
|
|
" count \\\n",
|
|
"product_category Clothing Electronics Home & Garden Books Clothing \n",
|
|
"region \n",
|
|
"East 197.87 238.59 239.66 57 63 \n",
|
|
"North 226.34 220.71 257.54 63 66 \n",
|
|
"South 260.70 234.70 236.13 66 65 \n",
|
|
"West 239.77 218.63 244.56 43 69 \n",
|
|
"\n",
|
|
" \n",
|
|
"product_category Electronics Home & Garden \n",
|
|
"region \n",
|
|
"East 69 59 \n",
|
|
"North 69 69 \n",
|
|
"South 62 58 \n",
|
|
"West 58 64 \n",
|
|
"\n",
|
|
"Mixed aggregations for different metrics:\n",
|
|
" net_sales quantity total_order \\\n",
|
|
"sales_channel Online Phone Store Online Phone Store Online \n",
|
|
"region \n",
|
|
"East 34662.14 4164.23 16618.73 731 90 372 246.72 \n",
|
|
"North 36698.38 6597.29 18133.00 780 141 370 235.80 \n",
|
|
"South 38569.20 5475.73 18737.36 778 128 369 263.71 \n",
|
|
"West 38003.32 3745.70 11284.26 805 83 264 245.57 \n",
|
|
"\n",
|
|
" \n",
|
|
"sales_channel Phone Store \n",
|
|
"region \n",
|
|
"East 239.96 203.20 \n",
|
|
"North 243.26 240.63 \n",
|
|
"South 227.11 258.94 \n",
|
|
"West 204.47 213.05 \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Multiple aggregation functions in one pivot table\n",
|
|
"print(\"=== MULTIPLE AGGREGATION FUNCTIONS ===\")\n",
|
|
"\n",
|
|
"# Multiple aggregations for comprehensive analysis\n",
|
|
"multi_agg_pivot = df_business.pivot_table(\n",
|
|
" values='net_sales',\n",
|
|
" index='region',\n",
|
|
" columns='product_category',\n",
|
|
" aggfunc=['sum', 'mean', 'count'],\n",
|
|
" fill_value=0\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"Multiple aggregations (sum, mean, count):\")\n",
|
|
"print(multi_agg_pivot.round(2))\n",
|
|
"\n",
|
|
"# Different values with different aggregations\n",
|
|
"mixed_agg_pivot = df_business.pivot_table(\n",
|
|
" values=['net_sales', 'quantity', 'total_order'],\n",
|
|
" index='region',\n",
|
|
" columns='sales_channel',\n",
|
|
" aggfunc={\n",
|
|
" 'net_sales': 'sum',\n",
|
|
" 'quantity': 'sum',\n",
|
|
" 'total_order': 'mean'\n",
|
|
" },\n",
|
|
" fill_value=0\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"\\nMixed aggregations for different metrics:\")\n",
|
|
"print(mixed_agg_pivot.round(2))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 28,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== PIVOT TABLES WITH TOTALS ===\n",
|
|
"Pivot table with row and column totals:\n",
|
|
"product_category Books Clothing Electronics Home & Garden Total\n",
|
|
"region \n",
|
|
"East 12377.0 12466.0 16463.0 14140.0 55445.0\n",
|
|
"North 13492.0 14938.0 15229.0 17770.0 61429.0\n",
|
|
"South 17590.0 16946.0 14552.0 13695.0 62782.0\n",
|
|
"West 8157.0 16544.0 12681.0 15652.0 53033.0\n",
|
|
"Total 51615.0 60894.0 58924.0 61257.0 232689.0\n",
|
|
"\n",
|
|
"Sales distribution as percentages:\n",
|
|
"product_category Books Clothing Electronics Home & Garden Total\n",
|
|
"region \n",
|
|
"East 5.32 5.36 7.07 6.08 23.83\n",
|
|
"North 5.80 6.42 6.54 7.64 26.40\n",
|
|
"South 7.56 7.28 6.25 5.89 26.98\n",
|
|
"West 3.51 7.11 5.45 6.73 22.79\n",
|
|
"Total 22.18 26.17 25.32 26.33 100.00\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Pivot tables with totals and margins\n",
|
|
"print(\"=== PIVOT TABLES WITH TOTALS ===\")\n",
|
|
"\n",
|
|
"# Add margins (totals) to pivot table\n",
|
|
"pivot_with_totals = df_business.pivot_table(\n",
|
|
" values='net_sales',\n",
|
|
" index='region',\n",
|
|
" columns='product_category',\n",
|
|
" aggfunc='sum',\n",
|
|
" fill_value=0,\n",
|
|
" margins=True,\n",
|
|
" margins_name='Total'\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"Pivot table with row and column totals:\")\n",
|
|
"print(pivot_with_totals.round(0))\n",
|
|
"\n",
|
|
"# Calculate percentages of total\n",
|
|
"pivot_percentages = df_business.pivot_table(\n",
|
|
" values='net_sales',\n",
|
|
" index='region',\n",
|
|
" columns='product_category',\n",
|
|
" aggfunc='sum',\n",
|
|
" fill_value=0,\n",
|
|
" margins=True,\n",
|
|
" margins_name='Total'\n",
|
|
")\n",
|
|
"\n",
|
|
"# Convert to percentages (excluding totals row/column for calculation)\n",
|
|
"total_sales = pivot_percentages.loc['Total', 'Total']\n",
|
|
"pivot_pct = (pivot_percentages / total_sales * 100).round(2)\n",
|
|
"\n",
|
|
"print(\"\\nSales distribution as percentages:\")\n",
|
|
"print(pivot_pct)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 2. Advanced Pivot Table Techniques\n",
|
|
"\n",
|
|
"Complex pivot tables with multiple indices and custom aggregations."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== MULTI-LEVEL INDEX PIVOT TABLES ===\n",
|
|
"Hierarchical pivot (Region > Salesperson vs Product Category):\n",
|
|
"product_category Books Clothing Electronics Home & Garden\n",
|
|
"region salesperson \n",
|
|
"East Alice Johnson 1537.0605 838.5400 1792.4760 1693.2745\n",
|
|
" Bob Smith 2039.4035 1156.4895 1110.9565 950.9340\n",
|
|
" Charlie Brown 435.2560 952.0800 2610.3275 1098.6095\n",
|
|
" Diana Prince 1874.3990 1845.6785 1898.2705 1610.8440\n",
|
|
" Eve Wilson 1585.4275 1534.6530 1395.2180 1179.5475\n",
|
|
" Frank Miller 1106.1950 1476.6940 945.8550 988.5600\n",
|
|
" Grace Lee 1556.9905 1298.2700 865.6170 938.2240\n",
|
|
" Henry Davis 875.1640 725.9365 2336.0330 1322.1800\n",
|
|
" Ivy Chen 797.3240 1608.6365 1110.4660 1724.6260\n",
|
|
" Jack Robinson 569.6290 1028.8020 2397.5045 2632.9480\n",
|
|
"North Alice Johnson 2314.9745 2757.4270 1911.9930 1856.1875\n",
|
|
" Bob Smith 1031.4940 489.7395 801.7940 796.1610\n",
|
|
" Charlie Brown 498.9095 1879.5460 1790.1620 1393.1530\n",
|
|
" Diana Prince 214.3950 1484.9535 1053.2905 2467.8815\n",
|
|
" Eve Wilson 2040.3085 1654.9070 1178.6505 1498.5810\n",
|
|
" Frank Miller 2509.5095 1490.6560 1264.5290 766.4540\n",
|
|
" Grace Lee 748.8710 1293.0970 833.7205 2235.9345\n",
|
|
" Henry Davis 815.0200 1089.6130 1072.8980 1280.2690\n",
|
|
" Ivy Chen 2411.9690 1079.7175 1762.1170 1244.5645\n",
|
|
" Jack Robinson 906.2425 1718.6920 3559.5100 4230.7635\n",
|
|
"\n",
|
|
"Hierarchical columns (Product Category > Customer Type):\n",
|
|
"product_category Books Clothing \\\n",
|
|
"customer_type New Returning VIP New Returning VIP \n",
|
|
"region \n",
|
|
"East 2077.0 7770.0 2530.0 2680.0 6856.0 2929.0 \n",
|
|
"North 2546.0 7153.0 3792.0 2959.0 7514.0 4465.0 \n",
|
|
"South 2831.0 10393.0 4366.0 6472.0 8685.0 1788.0 \n",
|
|
"West 3128.0 2906.0 2123.0 4974.0 8570.0 3000.0 \n",
|
|
"\n",
|
|
"product_category Electronics Home & Garden \n",
|
|
"customer_type New Returning VIP New Returning VIP \n",
|
|
"region \n",
|
|
"East 4562.0 7630.0 4271.0 3974.0 7367.0 2799.0 \n",
|
|
"North 6068.0 6693.0 2467.0 4970.0 10655.0 2145.0 \n",
|
|
"South 4919.0 6629.0 3004.0 2979.0 7527.0 3189.0 \n",
|
|
"West 3013.0 7416.0 2252.0 4220.0 7502.0 3929.0 \n",
|
|
"\n",
|
|
"Full hierarchical pivot (limited sample):\n",
|
|
"product_category Books Clothing \n",
|
|
"sales_channel Online Phone Store Online Phone Store\n",
|
|
"region quarter \n",
|
|
"East 1 2693.0 261.0 1483.0 1873.0 573.0 2045.0\n",
|
|
" 2 1735.0 233.0 1766.0 4208.0 644.0 529.0\n",
|
|
" 3 1358.0 0.0 1284.0 867.0 0.0 225.0\n",
|
|
" 4 1036.0 0.0 527.0 998.0 0.0 503.0\n",
|
|
"North 1 2012.0 0.0 1019.0 1987.0 948.0 1759.0\n",
|
|
" 2 1806.0 375.0 2181.0 1836.0 471.0 499.0\n",
|
|
" 3 2210.0 348.0 806.0 2702.0 305.0 311.0\n",
|
|
" 4 1649.0 0.0 1086.0 2437.0 0.0 1682.0\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Multi-level index pivot tables\n",
|
|
"print(\"=== MULTI-LEVEL INDEX PIVOT TABLES ===\")\n",
|
|
"\n",
|
|
"# Hierarchical rows\n",
|
|
"hierarchical_pivot = df_business.pivot_table(\n",
|
|
" values='net_sales',\n",
|
|
" index=['region', 'salesperson'],\n",
|
|
" columns='product_category',\n",
|
|
" aggfunc='sum',\n",
|
|
" fill_value=0\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"Hierarchical pivot (Region > Salesperson vs Product Category):\")\n",
|
|
"print(hierarchical_pivot.head(20))\n",
|
|
"\n",
|
|
"# Hierarchical columns\n",
|
|
"hierarchical_cols_pivot = df_business.pivot_table(\n",
|
|
" values='net_sales',\n",
|
|
" index='region',\n",
|
|
" columns=['product_category', 'customer_type'],\n",
|
|
" aggfunc='sum',\n",
|
|
" fill_value=0\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"\\nHierarchical columns (Product Category > Customer Type):\")\n",
|
|
"print(hierarchical_cols_pivot.round(0))\n",
|
|
"\n",
|
|
"# Both hierarchical rows and columns\n",
|
|
"full_hierarchical = df_business.pivot_table(\n",
|
|
" values='net_sales',\n",
|
|
" index=['region', 'quarter'],\n",
|
|
" columns=['product_category', 'sales_channel'],\n",
|
|
" aggfunc='sum',\n",
|
|
" fill_value=0\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"\\nFull hierarchical pivot (limited sample):\")\n",
|
|
"print(full_hierarchical.iloc[:8, :6].round(0)) # Show subset for readability"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 30,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== CUSTOM AGGREGATION FUNCTIONS ===\n",
|
|
"Custom aggregations pivot table:\n",
|
|
" mean std \\\n",
|
|
"product_category Books Clothing Electronics Home & Garden Books Clothing \n",
|
|
"region \n",
|
|
"East 217.14 197.87 238.59 239.66 133.32 176.45 \n",
|
|
"North 214.15 226.34 220.71 257.54 144.81 157.82 \n",
|
|
"South 266.51 260.70 234.70 236.13 153.97 158.86 \n",
|
|
"West 189.69 239.77 218.63 244.56 125.47 163.81 \n",
|
|
"\n",
|
|
" coefficient_of_variation \\\n",
|
|
"product_category Electronics Home & Garden Books Clothing \n",
|
|
"region \n",
|
|
"East 154.31 150.89 0.61 0.89 \n",
|
|
"North 165.37 174.64 0.68 0.70 \n",
|
|
"South 175.74 165.01 0.58 0.61 \n",
|
|
"West 143.66 167.40 0.66 0.68 \n",
|
|
"\n",
|
|
" sales_range \\\n",
|
|
"product_category Electronics Home & Garden Books Clothing Electronics \n",
|
|
"region \n",
|
|
"East 0.65 0.63 648.70 758.44 697.50 \n",
|
|
"North 0.75 0.68 531.31 663.18 752.50 \n",
|
|
"South 0.75 0.70 634.55 641.48 773.92 \n",
|
|
"West 0.66 0.68 436.05 629.05 638.34 \n",
|
|
"\n",
|
|
" \n",
|
|
"product_category Home & Garden \n",
|
|
"region \n",
|
|
"East 583.40 \n",
|
|
"North 619.36 \n",
|
|
"South 659.43 \n",
|
|
"West 656.63 \n",
|
|
"\n",
|
|
"Lambda function aggregations:\n",
|
|
" <lambda_0> <lambda_1> <lambda_2> \\\n",
|
|
"sales_channel Online Phone Store Online Phone Store Online \n",
|
|
"region \n",
|
|
"East 330.66 337.74 265.63 59 9 38 109.56 \n",
|
|
"North 340.40 317.78 358.12 72 15 34 318.50 \n",
|
|
"South 375.57 293.52 359.79 66 11 35 156.12 \n",
|
|
"West 352.13 323.46 267.42 69 8 22 54.42 \n",
|
|
"\n",
|
|
" mean \n",
|
|
"sales_channel Phone Store Online Phone Store \n",
|
|
"region \n",
|
|
"East 16.17 168.84 239.05 231.35 195.51 \n",
|
|
"North 72.98 38.41 227.94 235.62 232.47 \n",
|
|
"South 42.03 14.63 255.43 219.03 249.83 \n",
|
|
"West 53.92 53.58 237.52 197.14 205.17 \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Custom aggregation functions\n",
|
|
"print(\"=== CUSTOM AGGREGATION FUNCTIONS ===\")\n",
|
|
"\n",
|
|
"# Define custom aggregation functions\n",
|
|
"def coefficient_of_variation(series):\n",
|
|
" \"\"\"Calculate coefficient of variation (std/mean)\"\"\"\n",
|
|
" return series.std() / series.mean() if series.mean() != 0 else 0\n",
|
|
"\n",
|
|
"def sales_range(series):\n",
|
|
" \"\"\"Calculate range (max - min)\"\"\"\n",
|
|
" return series.max() - series.min()\n",
|
|
"\n",
|
|
"def high_value_count(series, threshold=100):\n",
|
|
" \"\"\"Count values above threshold\"\"\"\n",
|
|
" return (series > threshold).sum()\n",
|
|
"\n",
|
|
"# Apply custom aggregations\n",
|
|
"custom_agg_pivot = df_business.pivot_table(\n",
|
|
" values='net_sales',\n",
|
|
" index='region',\n",
|
|
" columns='product_category',\n",
|
|
" aggfunc=[np.mean, np.std, coefficient_of_variation, sales_range],\n",
|
|
" fill_value=0\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"Custom aggregations pivot table:\")\n",
|
|
"print(custom_agg_pivot.round(2))\n",
|
|
"\n",
|
|
"# Lambda functions for inline custom aggregations\n",
|
|
"lambda_agg_pivot = df_business.pivot_table(\n",
|
|
" values='net_sales',\n",
|
|
" index='region',\n",
|
|
" columns='sales_channel',\n",
|
|
" aggfunc={\n",
|
|
" 'net_sales': [\n",
|
|
" 'mean',\n",
|
|
" lambda x: x.quantile(0.75), # 75th percentile\n",
|
|
" lambda x: (x > x.mean()).sum(), # Count above average\n",
|
|
" lambda x: x.max() / x.min() if x.min() > 0 else 0 # Max/Min ratio\n",
|
|
" ]\n",
|
|
" },\n",
|
|
" fill_value=0\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"\\nLambda function aggregations:\")\n",
|
|
"print(lambda_agg_pivot.round(2))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 31,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== TIME-BASED PIVOT TABLES ===\n",
|
|
"Monthly sales by product category:\n",
|
|
"product_category Books Clothing Electronics Home & Garden\n",
|
|
"month_name \n",
|
|
"January 4813.0 7034.0 4631.0 3365.0\n",
|
|
"February 4022.0 8492.0 3016.0 5593.0\n",
|
|
"March 6809.0 3461.0 5373.0 6898.0\n",
|
|
"April 5464.0 5000.0 5719.0 5014.0\n",
|
|
"May 5767.0 4023.0 5243.0 6586.0\n",
|
|
"June 4104.0 8360.0 7231.0 2890.0\n",
|
|
"July 5149.0 3183.0 6009.0 4328.0\n",
|
|
"August 3052.0 6048.0 6709.0 5114.0\n",
|
|
"September 4366.0 3026.0 4470.0 8162.0\n",
|
|
"October 3442.0 4719.0 2349.0 4465.0\n",
|
|
"November 2562.0 3316.0 3646.0 5111.0\n",
|
|
"December 2064.0 4231.0 4528.0 3729.0\n",
|
|
"\n",
|
|
"Day of week analysis:\n",
|
|
" net_sales quantity \n",
|
|
"sales_channel Online Phone Store Online Phone Store\n",
|
|
"day_of_week \n",
|
|
"Monday 18611.51 3866.50 10728.14 4.80 5.14 4.91\n",
|
|
"Tuesday 19606.58 3841.09 9615.84 4.90 5.75 4.68\n",
|
|
"Wednesday 22785.70 945.00 9312.78 5.14 4.60 5.00\n",
|
|
"Thursday 22420.93 5605.94 8475.84 5.35 5.75 4.89\n",
|
|
"Friday 21358.62 1274.59 11041.92 5.06 3.18 5.36\n",
|
|
"Saturday 24008.00 1851.37 6557.88 4.85 4.10 4.06\n",
|
|
"Sunday 19141.68 2598.45 9040.94 4.99 4.57 3.88\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Time-based pivot tables\n",
|
|
"print(\"=== TIME-BASED PIVOT TABLES ===\")\n",
|
|
"\n",
|
|
"# Monthly sales trends by product category\n",
|
|
"monthly_sales = df_business.pivot_table(\n",
|
|
" values='net_sales',\n",
|
|
" index='month_name',\n",
|
|
" columns='product_category',\n",
|
|
" aggfunc='sum',\n",
|
|
" fill_value=0\n",
|
|
")\n",
|
|
"\n",
|
|
"# Reorder months correctly\n",
|
|
"month_order = ['January', 'February', 'March', 'April', 'May', 'June',\n",
|
|
" 'July', 'August', 'September', 'October', 'November', 'December']\n",
|
|
"monthly_sales = monthly_sales.reindex([m for m in month_order if m in monthly_sales.index])\n",
|
|
"\n",
|
|
"print(\"Monthly sales by product category:\")\n",
|
|
"print(monthly_sales.round(0))\n",
|
|
"\n",
|
|
"# Day of week analysis\n",
|
|
"dow_analysis = df_business.pivot_table(\n",
|
|
" values=['net_sales', 'quantity'],\n",
|
|
" index='day_of_week',\n",
|
|
" columns='sales_channel',\n",
|
|
" aggfunc={\n",
|
|
" 'net_sales': 'sum',\n",
|
|
" 'quantity': 'mean'\n",
|
|
" },\n",
|
|
" fill_value=0\n",
|
|
")\n",
|
|
"\n",
|
|
"# Reorder days of week\n",
|
|
"day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']\n",
|
|
"dow_analysis = dow_analysis.reindex([d for d in day_order if d in dow_analysis.index])\n",
|
|
"\n",
|
|
"print(\"\\nDay of week analysis:\")\n",
|
|
"print(dow_analysis.round(2))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 3. Cross-tabulation and Contingency Tables\n",
|
|
"\n",
|
|
"Analyzing relationships between categorical variables."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 32,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== CROSS-TABULATION ===\n",
|
|
"Transaction count by Region and Customer Type:\n",
|
|
"customer_type New Returning VIP\n",
|
|
"region \n",
|
|
"East 60 128 60\n",
|
|
"North 70 144 53\n",
|
|
"South 68 138 45\n",
|
|
"West 61 119 54\n",
|
|
"\n",
|
|
"Total sales by Region and Customer Type:\n",
|
|
"customer_type New Returning VIP\n",
|
|
"region \n",
|
|
"East 13293.0 29623.0 12529.0\n",
|
|
"North 16544.0 32016.0 12869.0\n",
|
|
"South 17201.0 33234.0 12347.0\n",
|
|
"West 15336.0 26394.0 11304.0\n",
|
|
"\n",
|
|
"Transaction count by Product Category and Sales Channel (with totals):\n",
|
|
"sales_channel Online Phone Store Total\n",
|
|
"product_category \n",
|
|
"Books 140 15 74 229\n",
|
|
"Clothing 169 26 68 263\n",
|
|
"Electronics 166 21 71 258\n",
|
|
"Home & Garden 142 28 80 250\n",
|
|
"Total 617 90 293 1000\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Basic cross-tabulation\n",
|
|
"print(\"=== CROSS-TABULATION ===\")\n",
|
|
"\n",
|
|
"# Simple crosstab - count of transactions\n",
|
|
"basic_crosstab = pd.crosstab(df_business['region'], df_business['customer_type'])\n",
|
|
"print(\"Transaction count by Region and Customer Type:\")\n",
|
|
"print(basic_crosstab)\n",
|
|
"\n",
|
|
"# Crosstab with values (not just counts)\n",
|
|
"sales_crosstab = pd.crosstab(\n",
|
|
" df_business['region'],\n",
|
|
" df_business['customer_type'],\n",
|
|
" values=df_business['net_sales'],\n",
|
|
" aggfunc='sum'\n",
|
|
")\n",
|
|
"print(\"\\nTotal sales by Region and Customer Type:\")\n",
|
|
"print(sales_crosstab.round(0))\n",
|
|
"\n",
|
|
"# Crosstab with margins (totals)\n",
|
|
"crosstab_with_margins = pd.crosstab(\n",
|
|
" df_business['product_category'],\n",
|
|
" df_business['sales_channel'],\n",
|
|
" margins=True,\n",
|
|
" margins_name='Total'\n",
|
|
")\n",
|
|
"print(\"\\nTransaction count by Product Category and Sales Channel (with totals):\")\n",
|
|
"print(crosstab_with_margins)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 33,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== NORMALIZED CROSS-TABULATION ===\n",
|
|
"Row percentages (Customer type distribution within each region):\n",
|
|
"customer_type New Returning VIP\n",
|
|
"region \n",
|
|
"East 24.2 51.6 24.2\n",
|
|
"North 26.2 53.9 19.9\n",
|
|
"South 27.1 55.0 17.9\n",
|
|
"West 26.1 50.9 23.1\n",
|
|
"\n",
|
|
"Column percentages (Region distribution within each customer type):\n",
|
|
"customer_type New Returning VIP\n",
|
|
"region \n",
|
|
"East 23.2 24.2 28.3\n",
|
|
"North 27.0 27.2 25.0\n",
|
|
"South 26.3 26.1 21.2\n",
|
|
"West 23.6 22.5 25.5\n",
|
|
"\n",
|
|
"Total percentages (Percentage of overall total):\n",
|
|
"customer_type New Returning VIP\n",
|
|
"region \n",
|
|
"East 6.0 12.8 6.0\n",
|
|
"North 7.0 14.4 5.3\n",
|
|
"South 6.8 13.8 4.5\n",
|
|
"West 6.1 11.9 5.4\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Normalized cross-tabulation (percentages)\n",
|
|
"print(\"=== NORMALIZED CROSS-TABULATION ===\")\n",
|
|
"\n",
|
|
"# Normalize by rows (percentage of each row)\n",
|
|
"crosstab_row_pct = pd.crosstab(\n",
|
|
" df_business['region'],\n",
|
|
" df_business['customer_type'],\n",
|
|
" normalize='index' # Normalize by rows\n",
|
|
") * 100\n",
|
|
"\n",
|
|
"print(\"Row percentages (Customer type distribution within each region):\")\n",
|
|
"print(crosstab_row_pct.round(1))\n",
|
|
"\n",
|
|
"# Normalize by columns (percentage of each column)\n",
|
|
"crosstab_col_pct = pd.crosstab(\n",
|
|
" df_business['region'],\n",
|
|
" df_business['customer_type'],\n",
|
|
" normalize='columns' # Normalize by columns\n",
|
|
") * 100\n",
|
|
"\n",
|
|
"print(\"\\nColumn percentages (Region distribution within each customer type):\")\n",
|
|
"print(crosstab_col_pct.round(1))\n",
|
|
"\n",
|
|
"# Normalize by total (percentage of grand total)\n",
|
|
"crosstab_total_pct = pd.crosstab(\n",
|
|
" df_business['region'],\n",
|
|
" df_business['customer_type'],\n",
|
|
" normalize='all' # Normalize by total\n",
|
|
") * 100\n",
|
|
"\n",
|
|
"print(\"\\nTotal percentages (Percentage of overall total):\")\n",
|
|
"print(crosstab_total_pct.round(1))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 34,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== MULTI-DIMENSIONAL CROSS-TABULATION ===\n",
|
|
"Three-way analysis (Region & Product Category vs Customer Type):\n",
|
|
"customer_type New Returning VIP\n",
|
|
"region product_category \n",
|
|
"East Books 188.82 228.53 210.82\n",
|
|
" Clothing 206.15 221.17 154.18\n",
|
|
" Electronics 240.11 224.41 266.92\n",
|
|
" Home & Garden 233.78 254.03 215.29\n",
|
|
"North Books 254.64 183.41 270.87\n",
|
|
" Clothing 211.37 208.73 279.06\n",
|
|
" Electronics 233.40 230.79 176.23\n",
|
|
" Home & Garden 248.49 266.38 238.33\n",
|
|
"South Books 202.21 273.49 311.87\n",
|
|
" Clothing 340.66 234.73 198.67\n",
|
|
" Electronics 245.96 220.96 250.30\n",
|
|
" Home & Garden 198.59 228.10 318.92\n",
|
|
"West Books 223.46 193.70 151.62\n",
|
|
" Clothing 248.72 231.62 250.01\n",
|
|
" Electronics 251.07 211.88 204.71\n",
|
|
" Home & Garden 281.34 234.44 231.13\n",
|
|
"\n",
|
|
"Multiple aggregations crosstab:\n",
|
|
" count sum mean \\\n",
|
|
"sales_channel Online Phone Store Online Phone Store Online Phone \n",
|
|
"region \n",
|
|
"East 145 18 85 34662.14 4164.23 16618.73 239.05 231.35 \n",
|
|
"North 161 28 78 36698.38 6597.29 18133.00 227.94 235.62 \n",
|
|
"South 151 25 75 38569.20 5475.73 18737.36 255.43 219.03 \n",
|
|
"West 160 19 55 38003.32 3745.70 11284.26 237.52 197.14 \n",
|
|
"\n",
|
|
" \n",
|
|
"sales_channel Store \n",
|
|
"region \n",
|
|
"East 195.51 \n",
|
|
"North 232.47 \n",
|
|
"South 249.83 \n",
|
|
"West 205.17 \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Multi-dimensional cross-tabulation\n",
|
|
"print(\"=== MULTI-DIMENSIONAL CROSS-TABULATION ===\")\n",
|
|
"\n",
|
|
"# Three-way crosstab\n",
|
|
"three_way_crosstab = pd.crosstab(\n",
|
|
" [df_business['region'], df_business['product_category']],\n",
|
|
" df_business['customer_type'],\n",
|
|
" values=df_business['net_sales'],\n",
|
|
" aggfunc='mean'\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"Three-way analysis (Region & Product Category vs Customer Type):\")\n",
|
|
"print(three_way_crosstab.round(2))\n",
|
|
"\n",
|
|
"# Analysis with multiple aggregations\n",
|
|
"multi_agg_crosstab = pd.crosstab(\n",
|
|
" df_business['region'],\n",
|
|
" df_business['sales_channel'],\n",
|
|
" values=df_business['net_sales'],\n",
|
|
" aggfunc=['count', 'sum', 'mean']\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"\\nMultiple aggregations crosstab:\")\n",
|
|
"print(multi_agg_crosstab.round(2))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 4. Data Reshaping: Melt, Pivot, Stack, Unstack\n",
|
|
"\n",
|
|
"Transforming data between wide and long formats."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 35,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== MELTING DATA (WIDE TO LONG) ===\n",
|
|
"Wide format data:\n",
|
|
"product_category salesperson Books Clothing Electronics \\\n",
|
|
"0 Alice Johnson 7260.4360 7069.2950 8358.1560 \n",
|
|
"1 Bob Smith 4511.1555 4336.2970 3679.3625 \n",
|
|
"2 Charlie Brown 3677.6895 7830.9720 10264.1605 \n",
|
|
"3 Diana Prince 4861.2975 6275.8940 3348.5570 \n",
|
|
"4 Eve Wilson 6363.1345 7208.5765 5191.2705 \n",
|
|
"\n",
|
|
"product_category Home & Garden \n",
|
|
"0 5283.4600 \n",
|
|
"1 5164.6310 \n",
|
|
"2 4281.9240 \n",
|
|
"3 6108.1495 \n",
|
|
"4 6582.1870 \n",
|
|
"\n",
|
|
"Long format data (melted):\n",
|
|
" salesperson product_category total_sales\n",
|
|
"0 Alice Johnson Books 7260.4360\n",
|
|
"1 Bob Smith Books 4511.1555\n",
|
|
"2 Charlie Brown Books 3677.6895\n",
|
|
"3 Diana Prince Books 4861.2975\n",
|
|
"4 Eve Wilson Books 6363.1345\n",
|
|
"5 Frank Miller Books 5489.7555\n",
|
|
"6 Grace Lee Books 5945.8655\n",
|
|
"7 Henry Davis Books 5273.0950\n",
|
|
"8 Ivy Chen Books 4812.9940\n",
|
|
"9 Jack Robinson Books 3419.5195\n",
|
|
"\n",
|
|
"Complex wide format:\n",
|
|
"product_category region salesperson Books Clothing Electronics \\\n",
|
|
"0 East Alice Johnson 1537.0605 838.5400 1792.4760 \n",
|
|
"1 East Bob Smith 2039.4035 1156.4895 1110.9565 \n",
|
|
"2 East Charlie Brown 435.2560 952.0800 2610.3275 \n",
|
|
"3 East Diana Prince 1874.3990 1845.6785 1898.2705 \n",
|
|
"4 East Eve Wilson 1585.4275 1534.6530 1395.2180 \n",
|
|
"\n",
|
|
"product_category Home & Garden \n",
|
|
"0 1693.2745 \n",
|
|
"1 950.9340 \n",
|
|
"2 1098.6095 \n",
|
|
"3 1610.8440 \n",
|
|
"4 1179.5475 \n",
|
|
"\n",
|
|
"Complex long format:\n",
|
|
" region salesperson product_category total_sales\n",
|
|
"0 East Alice Johnson Books 1537.0605\n",
|
|
"1 East Bob Smith Books 2039.4035\n",
|
|
"2 East Charlie Brown Books 435.2560\n",
|
|
"3 East Diana Prince Books 1874.3990\n",
|
|
"4 East Eve Wilson Books 1585.4275\n",
|
|
"5 East Frank Miller Books 1106.1950\n",
|
|
"6 East Grace Lee Books 1556.9905\n",
|
|
"7 East Henry Davis Books 875.1640\n",
|
|
"8 East Ivy Chen Books 797.3240\n",
|
|
"9 East Jack Robinson Books 569.6290\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Melting data from wide to long format\n",
|
|
"print(\"=== MELTING DATA (WIDE TO LONG) ===\")\n",
|
|
"\n",
|
|
"# Create a wide format dataset first\n",
|
|
"wide_sales = df_business.pivot_table(\n",
|
|
" values='net_sales',\n",
|
|
" index='salesperson',\n",
|
|
" columns='product_category',\n",
|
|
" aggfunc='sum',\n",
|
|
" fill_value=0\n",
|
|
").reset_index()\n",
|
|
"\n",
|
|
"print(\"Wide format data:\")\n",
|
|
"print(wide_sales.head())\n",
|
|
"\n",
|
|
"# Melt to long format\n",
|
|
"long_sales = pd.melt(\n",
|
|
" wide_sales,\n",
|
|
" id_vars=['salesperson'],\n",
|
|
" var_name='product_category',\n",
|
|
" value_name='total_sales'\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"\\nLong format data (melted):\")\n",
|
|
"print(long_sales.head(10))\n",
|
|
"\n",
|
|
"# Melt with multiple ID variables\n",
|
|
"# First create a more complex wide dataset\n",
|
|
"complex_wide = df_business.groupby(['region', 'salesperson', 'product_category'])['net_sales'].sum().unstack(fill_value=0).reset_index()\n",
|
|
"\n",
|
|
"print(\"\\nComplex wide format:\")\n",
|
|
"print(complex_wide.head())\n",
|
|
"\n",
|
|
"# Melt with multiple ID vars\n",
|
|
"complex_long = pd.melt(\n",
|
|
" complex_wide,\n",
|
|
" id_vars=['region', 'salesperson'],\n",
|
|
" var_name='product_category',\n",
|
|
" value_name='total_sales'\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"\\nComplex long format:\")\n",
|
|
"print(complex_long.head(10))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 36,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== PIVOT OPERATION (LONG TO WIDE) ===\n",
|
|
"Long format data sample:\n",
|
|
" region product_category customer_type net_sales\n",
|
|
"0 North Clothing Returning 72.810\n",
|
|
"1 East Clothing Returning 764.847\n",
|
|
"2 West Books VIP 27.660\n",
|
|
"3 West Clothing New 252.080\n",
|
|
"4 East Clothing VIP 220.014\n",
|
|
"\n",
|
|
"Pivoted to wide format:\n",
|
|
"product_category Books Clothing Electronics Home & Garden\n",
|
|
"region \n",
|
|
"East 12377.0 12466.0 16463.0 14140.0\n",
|
|
"North 13492.0 14938.0 15229.0 17770.0\n",
|
|
"South 17590.0 16946.0 14552.0 13695.0\n",
|
|
"West 8157.0 16544.0 12681.0 15652.0\n",
|
|
"\n",
|
|
"Pivoted with reset index:\n",
|
|
"product_category region Books Clothing Electronics Home & Garden\n",
|
|
"0 East 12376.8490 12465.7800 16462.7240 14139.7475\n",
|
|
"1 North 13491.6935 14938.3485 15228.6645 17769.9495\n",
|
|
"2 South 17589.7640 16945.5435 14551.5775 13695.4025\n",
|
|
"3 West 8156.6360 16544.4195 12680.5820 15651.6335\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Pivot operation (long to wide)\n",
|
|
"print(\"=== PIVOT OPERATION (LONG TO WIDE) ===\")\n",
|
|
"\n",
|
|
"# Create long format data\n",
|
|
"long_data = df_business[['region', 'product_category', 'customer_type', 'net_sales']].copy()\n",
|
|
"print(\"Long format data sample:\")\n",
|
|
"print(long_data.head())\n",
|
|
"\n",
|
|
"# Simple pivot\n",
|
|
"pivoted_data = long_data.pivot_table(\n",
|
|
" values='net_sales',\n",
|
|
" index='region',\n",
|
|
" columns='product_category',\n",
|
|
" aggfunc='sum'\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"\\nPivoted to wide format:\")\n",
|
|
"print(pivoted_data.round(0))\n",
|
|
"\n",
|
|
"# Reset index to make it a regular DataFrame\n",
|
|
"pivoted_reset = pivoted_data.reset_index()\n",
|
|
"print(\"\\nPivoted with reset index:\")\n",
|
|
"print(pivoted_reset.head())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 37,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== STACK AND UNSTACK OPERATIONS ===\n",
|
|
"Multi-index DataFrame:\n",
|
|
"customer_type New Returning VIP\n",
|
|
"region product_category \n",
|
|
"East Books 2077.0530 7769.9840 2529.8120\n",
|
|
" Clothing 2679.9640 6856.4155 2929.4005\n",
|
|
" Electronics 4562.0515 7629.9035 4270.7690\n",
|
|
" Home & Garden 3974.1930 7366.8250 2798.7295\n",
|
|
"North Books 2546.4320 7153.0835 3792.1780\n",
|
|
" Clothing 2959.1455 7514.3230 4464.8800\n",
|
|
" Electronics 6068.4815 6692.9180 2467.2650\n",
|
|
" Home & Garden 4969.7755 10655.2095 2144.9645\n",
|
|
"South Books 2830.9150 10392.7060 4366.1430\n",
|
|
" Clothing 6472.4515 8685.0290 1788.0630\n",
|
|
"\n",
|
|
"After stacking (columns become rows):\n",
|
|
"region product_category customer_type\n",
|
|
"East Books New 2077.0530\n",
|
|
" Returning 7769.9840\n",
|
|
" VIP 2529.8120\n",
|
|
" Clothing New 2679.9640\n",
|
|
" Returning 6856.4155\n",
|
|
" VIP 2929.4005\n",
|
|
" Electronics New 4562.0515\n",
|
|
" Returning 7629.9035\n",
|
|
" VIP 4270.7690\n",
|
|
" Home & Garden New 3974.1930\n",
|
|
"dtype: float64\n",
|
|
"\n",
|
|
"After unstacking (back to original):\n",
|
|
"customer_type New Returning VIP\n",
|
|
"region product_category \n",
|
|
"East Books 2077.0530 7769.9840 2529.8120\n",
|
|
" Clothing 2679.9640 6856.4155 2929.4005\n",
|
|
" Electronics 4562.0515 7629.9035 4270.7690\n",
|
|
" Home & Garden 3974.1930 7366.8250 2798.7295\n",
|
|
"North Books 2546.4320 7153.0835 3792.1780\n",
|
|
" Clothing 2959.1455 7514.3230 4464.8800\n",
|
|
" Electronics 6068.4815 6692.9180 2467.2650\n",
|
|
" Home & Garden 4969.7755 10655.2095 2144.9645\n",
|
|
"South Books 2830.9150 10392.7060 4366.1430\n",
|
|
" Clothing 6472.4515 8685.0290 1788.0630\n",
|
|
"\n",
|
|
"Unstacking level 0 (region):\n",
|
|
"region East North South West\n",
|
|
"product_category customer_type \n",
|
|
"Books New 2077.0530 2546.4320 2830.9150 3128.4435\n",
|
|
" Returning 7769.9840 7153.0835 10392.7060 2905.5140\n",
|
|
" VIP 2529.8120 3792.1780 4366.1430 2122.6785\n",
|
|
"Clothing New 2679.9640 2959.1455 6472.4515 4974.4750\n",
|
|
" Returning 6856.4155 7514.3230 8685.0290 8569.8735\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Stack and Unstack operations\n",
|
|
"print(\"=== STACK AND UNSTACK OPERATIONS ===\")\n",
|
|
"\n",
|
|
"# Create a DataFrame with MultiIndex\n",
|
|
"multi_index_df = df_business.groupby(['region', 'product_category', 'customer_type'])['net_sales'].sum().reset_index()\n",
|
|
"multi_pivot = multi_index_df.pivot_table(\n",
|
|
" values='net_sales',\n",
|
|
" index=['region', 'product_category'],\n",
|
|
" columns='customer_type',\n",
|
|
" fill_value=0\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"Multi-index DataFrame:\")\n",
|
|
"print(multi_pivot.head(10))\n",
|
|
"\n",
|
|
"# Stack operation (columns to rows)\n",
|
|
"stacked = multi_pivot.stack()\n",
|
|
"print(\"\\nAfter stacking (columns become rows):\")\n",
|
|
"print(stacked.head(10))\n",
|
|
"\n",
|
|
"# Unstack operation (rows to columns)\n",
|
|
"unstacked = stacked.unstack()\n",
|
|
"print(\"\\nAfter unstacking (back to original):\")\n",
|
|
"print(unstacked.head(10))\n",
|
|
"\n",
|
|
"# Unstack different levels\n",
|
|
"unstacked_level0 = multi_pivot.stack().unstack(level=0)\n",
|
|
"print(\"\\nUnstacking level 0 (region):\")\n",
|
|
"print(unstacked_level0.head())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 5. Advanced Reshaping Techniques\n",
|
|
"\n",
|
|
"Complex data transformations for specialized analysis."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 38,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== MULTIPLE VALUE COLUMNS MELTING ===\n",
|
|
"Wide format with multiple metrics:\n",
|
|
" region product_category net_sales quantity total_order\n",
|
|
"0 East Books 12376.8490 277 225.021912\n",
|
|
"1 East Clothing 12465.7800 266 205.978571\n",
|
|
"2 East Electronics 16462.7240 349 245.975420\n",
|
|
"3 East Home & Garden 14139.7475 301 247.297246\n",
|
|
"4 North Books 13491.6935 296 221.647357\n",
|
|
"\n",
|
|
"Long format with multiple metrics:\n",
|
|
" region product_category metric value\n",
|
|
"0 East Books net_sales 12376.8490\n",
|
|
"1 East Clothing net_sales 12465.7800\n",
|
|
"2 East Electronics net_sales 16462.7240\n",
|
|
"3 East Home & Garden net_sales 14139.7475\n",
|
|
"4 North Books net_sales 13491.6935\n",
|
|
"5 North Clothing net_sales 14938.3485\n",
|
|
"6 North Electronics net_sales 15228.6645\n",
|
|
"7 North Home & Garden net_sales 17769.9495\n",
|
|
"8 South Books net_sales 17589.7640\n",
|
|
"9 South Clothing net_sales 16945.5435\n",
|
|
"10 South Electronics net_sales 14551.5775\n",
|
|
"11 South Home & Garden net_sales 13695.4025\n",
|
|
"12 West Books net_sales 8156.6360\n",
|
|
"13 West Clothing net_sales 16544.4195\n",
|
|
"14 West Electronics net_sales 12680.5820\n",
|
|
"\n",
|
|
"Reshaping: Region-Metric vs Product Category:\n",
|
|
"product_category Books Clothing Electronics Home & Garden\n",
|
|
"region metric \n",
|
|
"East net_sales 12376.85 12465.78 16462.72 14139.75\n",
|
|
" quantity 277.00 266.00 349.00 301.00\n",
|
|
" total_order 225.02 205.98 245.98 247.30\n",
|
|
"North net_sales 13491.69 14938.35 15228.66 17769.95\n",
|
|
" quantity 296.00 313.00 320.00 362.00\n",
|
|
" total_order 221.65 234.40 228.86 265.49\n",
|
|
"South net_sales 17589.76 16945.54 14551.58 13695.40\n",
|
|
" quantity 356.00 327.00 301.00 291.00\n",
|
|
" total_order 274.84 269.23 243.38 244.65\n",
|
|
"West net_sales 8156.64 16544.42 12680.58 15651.63\n",
|
|
" quantity 217.00 365.00 271.00 299.00\n",
|
|
" total_order 198.11 246.91 226.43 253.21\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Wide to long with multiple value columns\n",
|
|
"print(\"=== MULTIPLE VALUE COLUMNS MELTING ===\")\n",
|
|
"\n",
|
|
"# Create dataset with multiple metrics\n",
|
|
"metrics_wide = df_business.groupby(['region', 'product_category']).agg({\n",
|
|
" 'net_sales': 'sum',\n",
|
|
" 'quantity': 'sum',\n",
|
|
" 'total_order': 'mean'\n",
|
|
"}).reset_index()\n",
|
|
"\n",
|
|
"print(\"Wide format with multiple metrics:\")\n",
|
|
"print(metrics_wide.head())\n",
|
|
"\n",
|
|
"# Melt multiple value columns\n",
|
|
"metrics_long = pd.melt(\n",
|
|
" metrics_wide,\n",
|
|
" id_vars=['region', 'product_category'],\n",
|
|
" value_vars=['net_sales', 'quantity', 'total_order'],\n",
|
|
" var_name='metric',\n",
|
|
" value_name='value'\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"\\nLong format with multiple metrics:\")\n",
|
|
"print(metrics_long.head(15))\n",
|
|
"\n",
|
|
"# Alternative: melt and then pivot for different structure\n",
|
|
"metrics_pivot = metrics_long.pivot_table(\n",
|
|
" values='value',\n",
|
|
" index=['region', 'metric'],\n",
|
|
" columns='product_category',\n",
|
|
" fill_value=0\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"\\nReshaping: Region-Metric vs Product Category:\")\n",
|
|
"print(metrics_pivot.round(2))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 39,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== TIME SERIES PIVOT TABLES ===\n",
|
|
"Daily sales pivot (first 10 days):\n",
|
|
"product_category Books Clothing Electronics Home & Garden\n",
|
|
"date \n",
|
|
"2024-01-01 NaN 73.0 NaN NaN\n",
|
|
"2024-01-02 NaN 765.0 NaN NaN\n",
|
|
"2024-01-03 28.0 NaN NaN NaN\n",
|
|
"2024-01-04 NaN 252.0 NaN NaN\n",
|
|
"2024-01-05 NaN 220.0 NaN NaN\n",
|
|
"2024-01-06 112.0 NaN NaN NaN\n",
|
|
"2024-01-07 NaN NaN NaN 172.0\n",
|
|
"2024-01-08 NaN NaN 201.0 NaN\n",
|
|
"2024-01-09 NaN NaN NaN 153.0\n",
|
|
"2024-01-10 24.0 NaN NaN NaN\n",
|
|
"\n",
|
|
"7-day rolling average (sample):\n",
|
|
"product_category Books Clothing Electronics Home & Garden\n",
|
|
"date \n",
|
|
"2026-09-22 32.15 0.0 60.29 191.03\n",
|
|
"2026-09-23 32.15 0.0 60.29 184.35\n",
|
|
"2026-09-24 18.09 0.0 84.62 184.35\n",
|
|
"2026-09-25 57.20 0.0 84.62 89.29\n",
|
|
"2026-09-26 57.20 0.0 113.45 89.29\n",
|
|
"\n",
|
|
"Monthly sales:\n",
|
|
"product_category Books Clothing Electronics Home & Garden\n",
|
|
"period \n",
|
|
"2024-01 1393.0 2789.0 983.0 1657.0\n",
|
|
"2024-02 2322.0 2198.0 877.0 2445.0\n",
|
|
"2024-03 2665.0 1029.0 1393.0 1555.0\n",
|
|
"2024-04 1722.0 1728.0 2782.0 2171.0\n",
|
|
"2024-05 2191.0 1795.0 1068.0 2092.0\n",
|
|
"... ... ... ... ...\n",
|
|
"2026-05 1651.0 1084.0 2314.0 2334.0\n",
|
|
"2026-06 2077.0 2676.0 1940.0 261.0\n",
|
|
"2026-07 1445.0 811.0 2807.0 1512.0\n",
|
|
"2026-08 104.0 2701.0 2219.0 1606.0\n",
|
|
"2026-09 1570.0 285.0 1295.0 3671.0\n",
|
|
"\n",
|
|
"[33 rows x 4 columns]\n",
|
|
"\n",
|
|
"Month-over-month growth (%):\n",
|
|
"product_category Books Clothing Electronics Home & Garden\n",
|
|
"period \n",
|
|
"2024-01 NaN NaN NaN NaN\n",
|
|
"2024-02 66.7 -21.2 -10.7 47.5\n",
|
|
"2024-03 14.8 -53.2 58.8 -36.4\n",
|
|
"2024-04 -35.4 67.9 99.7 39.6\n",
|
|
"2024-05 27.2 3.9 -61.6 -3.6\n",
|
|
"... ... ... ... ...\n",
|
|
"2026-05 -49.0 -24.0 88.6 203.8\n",
|
|
"2026-06 25.8 146.9 -16.2 -88.8\n",
|
|
"2026-07 -30.4 -69.7 44.7 479.7\n",
|
|
"2026-08 -92.8 233.0 -21.0 6.2\n",
|
|
"2026-09 1403.9 -89.4 -41.6 128.5\n",
|
|
"\n",
|
|
"[33 rows x 4 columns]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Creating time series pivot tables\n",
|
|
"print(\"=== TIME SERIES PIVOT TABLES ===\")\n",
|
|
"\n",
|
|
"# Daily sales by product category\n",
|
|
"daily_sales = df_business.groupby(['date', 'product_category'])['net_sales'].sum().reset_index()\n",
|
|
"daily_pivot = daily_sales.pivot(index='date', columns='product_category', values='net_sales')\n",
|
|
"\n",
|
|
"print(\"Daily sales pivot (first 10 days):\")\n",
|
|
"print(daily_pivot.head(10).round(0))\n",
|
|
"\n",
|
|
"# Fill missing values and calculate rolling averages\n",
|
|
"daily_pivot_filled = daily_pivot.fillna(0)\n",
|
|
"rolling_avg = daily_pivot_filled.rolling(window=7).mean()\n",
|
|
"\n",
|
|
"print(\"\\n7-day rolling average (sample):\")\n",
|
|
"print(rolling_avg.tail(5).round(2))\n",
|
|
"\n",
|
|
"# Month-over-month growth\n",
|
|
"monthly_sales = df_business.groupby(['year', 'month', 'product_category'])['net_sales'].sum().reset_index()\n",
|
|
"monthly_sales['period'] = monthly_sales['year'].astype(str) + '-' + monthly_sales['month'].astype(str).str.zfill(2)\n",
|
|
"monthly_pivot = monthly_sales.pivot(index='period', columns='product_category', values='net_sales')\n",
|
|
"\n",
|
|
"print(\"\\nMonthly sales:\")\n",
|
|
"print(monthly_pivot.round(0))\n",
|
|
"\n",
|
|
"# Calculate month-over-month growth\n",
|
|
"mom_growth = monthly_pivot.pct_change() * 100\n",
|
|
"print(\"\\nMonth-over-month growth (%):\")\n",
|
|
"print(mom_growth.round(1))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 40,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== COMPLEX MULTI-LEVEL RESHAPING ===\n",
|
|
"Complex hierarchical data:\n",
|
|
" region salesperson quarter product_category net_sales quantity\n",
|
|
"0 East Alice Johnson 1 Books 710.9865 14\n",
|
|
"1 East Alice Johnson 1 Home & Garden 428.9600 8\n",
|
|
"2 East Alice Johnson 2 Books 240.5520 5\n",
|
|
"3 East Alice Johnson 2 Clothing 436.8720 7\n",
|
|
"4 East Alice Johnson 2 Electronics 750.9260 20\n",
|
|
"5 East Alice Johnson 2 Home & Garden 96.1745 2\n",
|
|
"6 East Alice Johnson 3 Books 223.0520 3\n",
|
|
"7 East Alice Johnson 3 Clothing 128.8600 2\n",
|
|
"8 East Alice Johnson 3 Electronics 567.8100 9\n",
|
|
"9 East Alice Johnson 3 Home & Garden 655.3200 12\n",
|
|
"\n",
|
|
"Salesperson performance by quarter and category:\n",
|
|
"product_category Books Clothing Electronics Home & Garden\n",
|
|
"region salesperson quarter \n",
|
|
"East Alice Johnson 1 710.9865 0.0000 0.0000 428.9600\n",
|
|
" 2 240.5520 436.8720 750.9260 96.1745\n",
|
|
" 3 223.0520 128.8600 567.8100 655.3200\n",
|
|
" 4 362.4700 272.8080 473.7400 512.8200\n",
|
|
" Bob Smith 1 775.7800 360.9700 0.0000 698.4970\n",
|
|
" 2 729.5005 561.8205 305.0745 10.1970\n",
|
|
" 3 534.1230 233.6990 0.0000 0.0000\n",
|
|
" 4 0.0000 0.0000 805.8820 242.2400\n",
|
|
" Charlie Brown 1 0.0000 675.4200 672.7680 220.9600\n",
|
|
" 2 141.1830 97.4520 1063.2360 689.1900\n",
|
|
" 3 233.1210 0.0000 455.7235 31.8835\n",
|
|
" 4 60.9520 179.2080 418.6000 156.5760\n",
|
|
" Diana Prince 1 294.5120 1280.4690 542.1370 58.3920\n",
|
|
" 2 1364.7370 244.2495 183.4500 1018.4200\n",
|
|
" 3 215.1500 320.9600 610.4835 266.0820\n",
|
|
"\n",
|
|
"Quarterly comparison view (sample):\n",
|
|
"product_category Books Clothing \\\n",
|
|
"quarter 1 2 3 4 1 2 3 \n",
|
|
"region salesperson \n",
|
|
"East Alice Johnson 711.0 241.0 223.0 362.0 0.0 437.0 129.0 \n",
|
|
" Bob Smith 776.0 730.0 534.0 0.0 361.0 562.0 234.0 \n",
|
|
" Charlie Brown 0.0 141.0 233.0 61.0 675.0 97.0 0.0 \n",
|
|
" Diana Prince 295.0 1365.0 215.0 0.0 1280.0 244.0 321.0 \n",
|
|
" Eve Wilson 0.0 501.0 281.0 803.0 505.0 942.0 88.0 \n",
|
|
"\n",
|
|
"product_category \n",
|
|
"quarter 4 \n",
|
|
"region salesperson \n",
|
|
"East Alice Johnson 273.0 \n",
|
|
" Bob Smith 0.0 \n",
|
|
" Charlie Brown 179.0 \n",
|
|
" Diana Prince 0.0 \n",
|
|
" Eve Wilson 0.0 \n",
|
|
"\n",
|
|
"Region summary:\n",
|
|
"product_category Books Clothing Electronics Home & Garden\n",
|
|
"region \n",
|
|
"East 12377.0 12466.0 16463.0 14140.0\n",
|
|
"North 13492.0 14938.0 15229.0 17770.0\n",
|
|
"South 17590.0 16946.0 14552.0 13695.0\n",
|
|
"West 8157.0 16544.0 12681.0 15652.0\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Complex multi-level reshaping\n",
|
|
"print(\"=== COMPLEX MULTI-LEVEL RESHAPING ===\")\n",
|
|
"\n",
|
|
"# Create complex hierarchical data\n",
|
|
"complex_data = df_business.groupby(['region', 'salesperson', 'quarter', 'product_category']).agg({\n",
|
|
" 'net_sales': 'sum',\n",
|
|
" 'quantity': 'sum'\n",
|
|
"}).reset_index()\n",
|
|
"\n",
|
|
"print(\"Complex hierarchical data:\")\n",
|
|
"print(complex_data.head(10))\n",
|
|
"\n",
|
|
"# Multiple pivot operations\n",
|
|
"# First pivot: Quarter vs Product Category for each salesperson\n",
|
|
"salesperson_pivot = complex_data.pivot_table(\n",
|
|
" values='net_sales',\n",
|
|
" index=['region', 'salesperson', 'quarter'],\n",
|
|
" columns='product_category',\n",
|
|
" fill_value=0\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"\\nSalesperson performance by quarter and category:\")\n",
|
|
"print(salesperson_pivot.head(15))\n",
|
|
"\n",
|
|
"# Stack and unstack for different views\n",
|
|
"# Unstack quarter to see quarterly comparison\n",
|
|
"quarterly_comparison = salesperson_pivot.unstack(level=2)\n",
|
|
"\n",
|
|
"print(\"\\nQuarterly comparison view (sample):\")\n",
|
|
"print(quarterly_comparison.iloc[:5, :8].round(0)) # Show subset\n",
|
|
"\n",
|
|
"# Create summary by region\n",
|
|
"region_summary = salesperson_pivot.groupby('region').sum()\n",
|
|
"print(\"\\nRegion summary:\")\n",
|
|
"print(region_summary.round(0))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 6. Business Intelligence Applications\n",
|
|
"\n",
|
|
"Real-world business analysis using pivot tables and reshaping."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 41,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== SALES PERFORMANCE DASHBOARD ===\n",
|
|
"1. Regional Performance Summary:\n",
|
|
" net_sales quantity total_order\n",
|
|
" mean sum sum count\n",
|
|
"region \n",
|
|
"East 223.57 55445.10 1193 248\n",
|
|
"North 230.07 61428.66 1291 267\n",
|
|
"South 250.13 62782.29 1275 251\n",
|
|
"West 226.64 53033.27 1152 234\n",
|
|
"\n",
|
|
"2. Category Performance by Quarter:\n",
|
|
"quarter 1 2 3 4 All\n",
|
|
"product_category \n",
|
|
"Books 15644.0 15334.0 12567.0 8069.0 51615.0\n",
|
|
"Clothing 18988.0 17384.0 12257.0 12266.0 60894.0\n",
|
|
"Electronics 13021.0 18192.0 17188.0 10523.0 58924.0\n",
|
|
"Home & Garden 15857.0 14490.0 17604.0 13305.0 61257.0\n",
|
|
"All 63509.0 65400.0 59617.0 44163.0 232689.0\n",
|
|
"\n",
|
|
"3. Sales Channel Analysis:\n",
|
|
" sum mean \n",
|
|
"customer_type New Returning VIP New Returning VIP\n",
|
|
"sales_channel \n",
|
|
"Online 39805.26 76878.78 31248.99 255.16 238.01 226.44\n",
|
|
"Phone 5438.57 10409.81 4134.56 236.46 212.45 229.70\n",
|
|
"Store 17130.46 33977.55 13665.33 214.13 216.42 244.02\n",
|
|
"\n",
|
|
"4. Top 5 Performers:\n",
|
|
" net_sales quantity total_order\n",
|
|
"salesperson \n",
|
|
"Alice Johnson 27971.35 550 118\n",
|
|
"Jack Robinson 27955.98 564 107\n",
|
|
"Charlie Brown 26054.75 529 110\n",
|
|
"Eve Wilson 25345.17 553 107\n",
|
|
"Frank Miller 24301.05 491 96\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Sales performance dashboard\n",
|
|
"print(\"=== SALES PERFORMANCE DASHBOARD ===\")\n",
|
|
"\n",
|
|
"def create_sales_dashboard(df):\n",
|
|
" \"\"\"Create comprehensive sales dashboard using pivot tables\"\"\"\n",
|
|
" dashboard = {}\n",
|
|
" \n",
|
|
" # 1. Regional performance summary\n",
|
|
" dashboard['regional_summary'] = df.pivot_table(\n",
|
|
" values=['net_sales', 'quantity', 'total_order'],\n",
|
|
" index='region',\n",
|
|
" aggfunc={\n",
|
|
" 'net_sales': ['sum', 'mean'],\n",
|
|
" 'quantity': 'sum',\n",
|
|
" 'total_order': 'count'\n",
|
|
" }\n",
|
|
" ).round(2)\n",
|
|
" \n",
|
|
" # 2. Product category performance\n",
|
|
" dashboard['category_performance'] = df.pivot_table(\n",
|
|
" values='net_sales',\n",
|
|
" index='product_category',\n",
|
|
" columns='quarter',\n",
|
|
" aggfunc='sum',\n",
|
|
" margins=True,\n",
|
|
" fill_value=0\n",
|
|
" ).round(0)\n",
|
|
" \n",
|
|
" # 3. Sales channel analysis\n",
|
|
" dashboard['channel_analysis'] = df.pivot_table(\n",
|
|
" values='net_sales',\n",
|
|
" index='sales_channel',\n",
|
|
" columns='customer_type',\n",
|
|
" aggfunc=['sum', 'mean'],\n",
|
|
" fill_value=0\n",
|
|
" ).round(2)\n",
|
|
" \n",
|
|
" # 4. Top performers\n",
|
|
" salesperson_performance = df.groupby('salesperson').agg({\n",
|
|
" 'net_sales': 'sum',\n",
|
|
" 'quantity': 'sum',\n",
|
|
" 'total_order': 'count'\n",
|
|
" }).round(2)\n",
|
|
" dashboard['top_performers'] = salesperson_performance.sort_values('net_sales', ascending=False).head(5)\n",
|
|
" \n",
|
|
" # 5. Monthly trends\n",
|
|
" dashboard['monthly_trends'] = df.pivot_table(\n",
|
|
" values='net_sales',\n",
|
|
" index='month_name',\n",
|
|
" columns='product_category',\n",
|
|
" aggfunc='sum',\n",
|
|
" fill_value=0\n",
|
|
" ).round(0)\n",
|
|
" \n",
|
|
" return dashboard\n",
|
|
"\n",
|
|
"# Generate dashboard\n",
|
|
"sales_dashboard = create_sales_dashboard(df_business)\n",
|
|
"\n",
|
|
"print(\"1. Regional Performance Summary:\")\n",
|
|
"print(sales_dashboard['regional_summary'])\n",
|
|
"\n",
|
|
"print(\"\\n2. Category Performance by Quarter:\")\n",
|
|
"print(sales_dashboard['category_performance'])\n",
|
|
"\n",
|
|
"print(\"\\n3. Sales Channel Analysis:\")\n",
|
|
"print(sales_dashboard['channel_analysis'])\n",
|
|
"\n",
|
|
"print(\"\\n4. Top 5 Performers:\")\n",
|
|
"print(sales_dashboard['top_performers'])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 42,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== CUSTOMER SEGMENTATION ANALYSIS ===\n",
|
|
"Customer behavior by type and channel:\n",
|
|
" discount_amount net_sales \\\n",
|
|
"sales_channel Online Phone Store Online Phone Store \n",
|
|
"customer_type \n",
|
|
"New 11.73 15.24 10.61 255.16 236.46 214.13 \n",
|
|
"Returning 11.45 9.43 11.81 238.01 212.45 216.42 \n",
|
|
"VIP 11.36 15.36 15.59 226.44 229.70 244.02 \n",
|
|
"\n",
|
|
" quantity \n",
|
|
"sales_channel Online Phone Store \n",
|
|
"customer_type \n",
|
|
"New 5.29 4.78 4.47 \n",
|
|
"Returning 4.88 5.14 4.62 \n",
|
|
"VIP 5.03 4.44 5.21 \n",
|
|
"\n",
|
|
"Purchase patterns (% of quarterly sales):\n",
|
|
"quarter 1 2 3 4\n",
|
|
"customer_type product_category \n",
|
|
"New Books 6.2 4.4 3.1 4.3\n",
|
|
" Clothing 6.9 7.2 5.7 10.5\n",
|
|
" Electronics 4.3 9.5 9.7 8.6\n",
|
|
" Home & Garden 8.8 3.7 4.5 12.4\n",
|
|
"Returning Books 13.2 13.8 9.8 11.3\n",
|
|
" Clothing 16.2 12.8 12.3 12.7\n",
|
|
" Electronics 11.2 13.8 11.9 11.6\n",
|
|
" Home & Garden 10.2 15.3 17.7 13.7\n",
|
|
"VIP Books 5.2 5.3 8.1 2.7\n",
|
|
" Clothing 6.8 6.5 2.6 4.6\n",
|
|
" Electronics 4.9 4.5 7.2 3.6\n",
|
|
" Home & Garden 6.1 3.2 7.3 4.1\n",
|
|
"\n",
|
|
"Customer value distribution:\n",
|
|
" count mean std min 25% 50% 75% max\n",
|
|
"customer_type \n",
|
|
"New 259.0 248.78 155.20 18.31 128.74 217.77 345.09 768.32\n",
|
|
"Returning 529.0 237.24 160.52 9.76 109.41 201.91 342.29 798.01\n",
|
|
"VIP 212.0 239.56 158.12 25.92 115.17 198.15 357.37 673.39\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Customer segmentation analysis\n",
|
|
"print(\"=== CUSTOMER SEGMENTATION ANALYSIS ===\")\n",
|
|
"\n",
|
|
"# Customer behavior analysis\n",
|
|
"customer_behavior = df_business.pivot_table(\n",
|
|
" values=['net_sales', 'quantity', 'discount_amount'],\n",
|
|
" index='customer_type',\n",
|
|
" columns='sales_channel',\n",
|
|
" aggfunc={\n",
|
|
" 'net_sales': 'mean',\n",
|
|
" 'quantity': 'mean',\n",
|
|
" 'discount_amount': 'mean'\n",
|
|
" },\n",
|
|
" fill_value=0\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"Customer behavior by type and channel:\")\n",
|
|
"print(customer_behavior.round(2))\n",
|
|
"\n",
|
|
"# Purchase patterns analysis\n",
|
|
"purchase_patterns = pd.crosstab(\n",
|
|
" [df_business['customer_type'], df_business['product_category']],\n",
|
|
" df_business['quarter'],\n",
|
|
" values=df_business['net_sales'],\n",
|
|
" aggfunc='sum',\n",
|
|
" normalize='columns'\n",
|
|
") * 100\n",
|
|
"\n",
|
|
"print(\"\\nPurchase patterns (% of quarterly sales):\")\n",
|
|
"print(purchase_patterns.round(1))\n",
|
|
"\n",
|
|
"# Customer value distribution\n",
|
|
"value_distribution = df_business.groupby('customer_type')['total_order'].describe()\n",
|
|
"print(\"\\nCustomer value distribution:\")\n",
|
|
"print(value_distribution.round(2))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 43,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== COHORT ANALYSIS ===\n",
|
|
"Cohort analysis (average sales by quarters since first purchase):\n",
|
|
"periods_since_first 0 1 2 3\n",
|
|
"first_quarter \n",
|
|
"1 234.0 240.0 219.0 240.0\n",
|
|
"\n",
|
|
"Cohort retention (number of active salespeople):\n",
|
|
"periods_since_first 0 1 2 3\n",
|
|
"first_quarter \n",
|
|
"1 271 273 272 184\n",
|
|
"\n",
|
|
"Retention rates (%):\n",
|
|
"periods_since_first 0 1 2 3\n",
|
|
"first_quarter \n",
|
|
"1 100.0 100.7 100.4 67.9\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Cohort analysis using pivot tables\n",
|
|
"print(\"=== COHORT ANALYSIS ===\")\n",
|
|
"\n",
|
|
"# Simplified cohort analysis by quarter\n",
|
|
"# Group customers by their first purchase quarter\n",
|
|
"customer_first_purchase = df_business.groupby('salesperson')['quarter'].min().reset_index()\n",
|
|
"customer_first_purchase.columns = ['salesperson', 'first_quarter']\n",
|
|
"\n",
|
|
"# Merge back to get cohort information\n",
|
|
"cohort_data = df_business.merge(customer_first_purchase, on='salesperson')\n",
|
|
"cohort_data['periods_since_first'] = cohort_data['quarter'] - cohort_data['first_quarter']\n",
|
|
"\n",
|
|
"# Create cohort table\n",
|
|
"cohort_table = cohort_data.pivot_table(\n",
|
|
" values='net_sales',\n",
|
|
" index='first_quarter',\n",
|
|
" columns='periods_since_first',\n",
|
|
" aggfunc='mean',\n",
|
|
" fill_value=0\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"Cohort analysis (average sales by quarters since first purchase):\")\n",
|
|
"print(cohort_table.round(0))\n",
|
|
"\n",
|
|
"# Retention analysis\n",
|
|
"cohort_counts = cohort_data.pivot_table(\n",
|
|
" values='salesperson',\n",
|
|
" index='first_quarter',\n",
|
|
" columns='periods_since_first',\n",
|
|
" aggfunc='count',\n",
|
|
" fill_value=0\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"\\nCohort retention (number of active salespeople):\")\n",
|
|
"print(cohort_counts)\n",
|
|
"\n",
|
|
"# Calculate retention rates\n",
|
|
"cohort_retention = cohort_counts.divide(cohort_counts.iloc[:, 0], axis=0) * 100\n",
|
|
"print(\"\\nRetention rates (%):\")\n",
|
|
"print(cohort_retention.round(1))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Practice Exercises\n",
|
|
"\n",
|
|
"Apply pivot tables and reshaping to complex business scenarios:"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 44,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Exercise 1: Multi-dimensional Business Intelligence Report\n",
|
|
"# Create a comprehensive BI report that includes:\n",
|
|
"# - Sales performance across multiple dimensions\n",
|
|
"# - Trend analysis with period-over-period comparisons\n",
|
|
"# - Customer segmentation insights\n",
|
|
"# - Product performance matrix\n",
|
|
"# - Actionable recommendations based on pivot table insights\n",
|
|
"\n",
|
|
"def create_comprehensive_bi_report(df):\n",
|
|
" \"\"\"Create multi-dimensional business intelligence report\"\"\"\n",
|
|
" # Your implementation here\n",
|
|
" pass\n",
|
|
"\n",
|
|
"# bi_report = create_comprehensive_bi_report(df_business)\n",
|
|
"# print(\"Comprehensive BI Report:\")\n",
|
|
"# for section, data in bi_report.items():\n",
|
|
"# print(f\"\\n{section}:\")\n",
|
|
"# print(data)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 45,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Exercise 2: Dynamic Pivot Table Generator\n",
|
|
"# Create a flexible function that can generate pivot tables with:\n",
|
|
"# - User-specified dimensions (rows, columns, values)\n",
|
|
"# - Multiple aggregation functions\n",
|
|
"# - Automatic handling of different data types\n",
|
|
"# - Export capabilities for different formats\n",
|
|
"\n",
|
|
"# Your code here:\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 46,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Exercise 3: Advanced Reshaping Challenge\n",
|
|
"# Transform the data through multiple reshaping operations:\n",
|
|
"# - Convert to time series format\n",
|
|
"# - Create rolling calculations\n",
|
|
"# - Build comparison matrices\n",
|
|
"# - Generate variance analysis reports\n",
|
|
"\n",
|
|
"# Your code here:\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Key Takeaways\n",
|
|
"\n",
|
|
"1. **Pivot Tables**:\n",
|
|
" - **`pivot_table()`**: Most flexible, handles duplicates with aggregation\n",
|
|
" - **`pivot()`**: Simple reshaping, requires unique index-column combinations\n",
|
|
" - **`crosstab()`**: Specialized for frequency tables and cross-tabulation\n",
|
|
"\n",
|
|
"2. **Reshaping Operations**:\n",
|
|
" - **`melt()`**: Wide to long format (unpivot)\n",
|
|
" - **`pivot()`**: Long to wide format\n",
|
|
" - **`stack()`**: Column to row index\n",
|
|
" - **`unstack()`**: Row index to column\n",
|
|
"\n",
|
|
"3. **Best Practices**:\n",
|
|
" - Use `fill_value=0` to handle missing combinations\n",
|
|
" - Add `margins=True` for totals when needed\n",
|
|
" - Choose appropriate aggregation functions for your data\n",
|
|
" - Consider data types when reshaping\n",
|
|
"\n",
|
|
"4. **Business Applications**:\n",
|
|
" - Sales performance analysis\n",
|
|
" - Customer segmentation\n",
|
|
" - Trend analysis and forecasting\n",
|
|
" - Cohort and retention analysis\n",
|
|
"\n",
|
|
"## Pivot Table Quick Reference\n",
|
|
"\n",
|
|
"```python\n",
|
|
"# Basic pivot table\n",
|
|
"df.pivot_table(values='sales', index='region', columns='product', aggfunc='sum')\n",
|
|
"\n",
|
|
"# Multiple aggregations\n",
|
|
"df.pivot_table(values='sales', index='region', aggfunc=['sum', 'mean', 'count'])\n",
|
|
"\n",
|
|
"# With margins (totals)\n",
|
|
"df.pivot_table(values='sales', index='region', columns='product', \n",
|
|
" aggfunc='sum', margins=True)\n",
|
|
"\n",
|
|
"# Cross-tabulation\n",
|
|
"pd.crosstab(df['region'], df['product'], normalize='index')\n",
|
|
"\n",
|
|
"# Reshaping\n",
|
|
"pd.melt(df, id_vars=['id'], value_vars=['col1', 'col2']) # Wide to long\n",
|
|
"df.pivot(index='date', columns='category', values='value') # Long to wide\n",
|
|
"```\n",
|
|
"\n",
|
|
"## Common Use Cases\n",
|
|
"\n",
|
|
"| Scenario | Best Tool | Key Parameters |\n",
|
|
"|----------|-----------|----------------|\n",
|
|
"| Sales by region/product | `pivot_table()` | `values='sales', index='region', columns='product'` |\n",
|
|
"| Frequency analysis | `crosstab()` | `normalize='index'` for percentages |\n",
|
|
"| Time series analysis | `pivot()` + `unstack()` | Handle dates in index |\n",
|
|
"| Data normalization | `melt()` | `id_vars` for identifiers |\n",
|
|
"| Multi-level analysis | Hierarchical indexing | Multiple columns in `index`/`columns` |"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.13.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|