{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Session 1 - DataFrames - Lesson 10: Time Series Analysis\n",
"\n",
"## Learning Objectives\n",
"- Master datetime indexing and time-based operations\n",
"- Learn resampling and frequency conversion techniques\n",
"- Understand rolling calculations and window functions\n",
"- Practice with seasonal analysis and trend decomposition\n",
"- Apply time series techniques to business forecasting scenarios\n",
"\n",
"## Prerequisites\n",
"- Completed Lessons 1-9\n",
"- Understanding of datetime concepts\n",
"- Basic knowledge of statistics (helpful for trend analysis)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Import required libraries\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from datetime import datetime, timedelta\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"# Set display options\n",
"pd.set_option('display.max_columns', None)\n",
"pd.set_option('display.max_rows', 20)\n",
"plt.style.use('seaborn-v0_8')\n",
"%matplotlib inline\n",
"\n",
"print(\"Libraries loaded successfully!\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Creating Time Series Data\n",
"\n",
"Let's create realistic time series datasets for analysis."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create comprehensive time series dataset\n",
"np.random.seed(42)\n",
"\n",
"# Generate 2 years of daily data\n",
"start_date = '2022-01-01'\n",
"end_date = '2023-12-31'\n",
"date_range = pd.date_range(start=start_date, end=end_date, freq='D')\n",
"\n",
"# Create realistic sales data with trends and seasonality\n",
"n_days = len(date_range)\n",
"base_sales = 1000\n",
"\n",
"# Add trend (gradual increase over time)\n",
"trend = np.linspace(0, 300, n_days)\n",
"\n",
"# Add seasonality (weekly, monthly, and yearly patterns)\n",
"weekly_pattern = np.sin(2 * np.pi * np.arange(n_days) / 7) * 100  # Weekly cycle\n",
"monthly_pattern = np.sin(2 * np.pi * np.arange(n_days) / 30.44) * 150  # Monthly cycle\n",
"yearly_pattern = np.sin(2 * np.pi * np.arange(n_days) / 365.25) * 200  # Yearly cycle\n",
"\n",
"# Add random noise\n",
"noise = np.random.normal(0, 80, n_days)\n",
"\n",
"# Combine all components\n",
"sales = base_sales + trend + weekly_pattern + monthly_pattern + yearly_pattern + noise\n",
"sales = np.maximum(sales, 0)  # Ensure non-negative sales\n",
"\n",
"# Create DataFrame\n",
"ts_data = pd.DataFrame({\n",
"    'date': date_range,\n",
"    'sales': sales,\n",
"    'customers': np.random.poisson(50, n_days) + (sales / 50).astype(int),\n",
"    'marketing_spend': np.random.gamma(2, 20, n_days),\n",
"    'temperature': 20 + 10 * np.sin(2 * np.pi * np.arange(n_days) / 365.25) + np.random.normal(0, 5, n_days),\n",
"    'is_weekend': pd.Series(date_range).dt.dayofweek >= 5,\n",
"    'is_holiday': np.random.choice([True, False], n_days, p=[0.05, 0.95])\n",
"})\n",
"\n",
"# Set date as index\n",
"ts_data.set_index('date', inplace=True)\n",
"\n",
"print(\"Time series dataset created:\")\n",
"print(f\"Shape: {ts_data.shape}\")\n",
"print(f\"Date range: {ts_data.index.min()} to {ts_data.index.max()}\")\n",
"print(\"\\nFirst few rows:\")\n",
"print(ts_data.head())\n",
"print(\"\\nData types:\")\n",
"print(ts_data.dtypes)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. DateTime Indexing and Basic Operations\n",
"\n",
"Working with datetime indices and time-based selection."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Basic datetime index operations\n",
"print(\"=== DATETIME INDEX OPERATIONS ===\")\n",
"\n",
"# Index information\n",
"print(f\"Index type: {type(ts_data.index)}\")\n",
"print(f\"Index frequency: {ts_data.index.freq}\")\n",
"print(f\"Index is monotonic increasing: {ts_data.index.is_monotonic_increasing}\")\n",
"print(f\"Index is monotonic decreasing: {ts_data.index.is_monotonic_decreasing}\")\n",
"print(f\"Index has duplicates: {ts_data.index.has_duplicates}\")\n",
"\n",
"# Time-based selection\n",
"print(\"\\n--- Time-based Selection ---\")\n",
"\n",
"# Select specific year\n",
"sales_2022 = ts_data.loc['2022']\n",
"print(f\"2022 data shape: {sales_2022.shape}\")\n",
"print(f\"2022 average daily sales: {sales_2022['sales'].mean():.2f}\")\n",
"\n",
"# Select specific month\n",
"jan_2023 = ts_data.loc['2023-01']\n",
"print(f\"\\nJanuary 2023 data shape: {jan_2023.shape}\")\n",
"print(f\"January 2023 total sales: {jan_2023['sales'].sum():.2f}\")\n",
"\n",
"# Select date range\n",
"q1_2023 = ts_data.loc['2023-01-01':'2023-03-31']\n",
"print(f\"\\nQ1 2023 data shape: {q1_2023.shape}\")\n",
"print(f\"Q1 2023 average sales: {q1_2023['sales'].mean():.2f}\")\n",
"\n",
"# Recent data (last 30 days)\n",
"recent_data = ts_data.tail(30)\n",
"print(f\"\\nLast 30 days average sales: {recent_data['sales'].mean():.2f}\")"
]
},
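{
"cell_type": "markdown",
"metadata": {},
"source": [
"**A note on selecting recent data.** `tail(30)` picks the last 30 *rows*, which matches the last 30 calendar days only when the index has no gaps. A small sketch of a date-arithmetic alternative using `pd.Timedelta` (assumes the daily `ts_data` built above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Select the last 30 calendar days by date arithmetic rather than row count\n",
"# (robust even if some days are missing from the index)\n",
"cutoff = ts_data.index.max() - pd.Timedelta(days=29)\n",
"last_30_days = ts_data.loc[cutoff:]\n",
"print(f\"Window: {last_30_days.index.min().date()} to {last_30_days.index.max().date()}\")\n",
"print(f\"Rows selected: {len(last_30_days)}\")\n",
"print(f\"Average sales: {last_30_days['sales'].mean():.2f}\")"
]
},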
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# DateTime component extraction\n",
"print(\"=== DATETIME COMPONENT EXTRACTION ===\")\n",
"\n",
"# Extract various date components\n",
"ts_enhanced = ts_data.copy()\n",
"ts_enhanced['year'] = ts_enhanced.index.year\n",
"ts_enhanced['month'] = ts_enhanced.index.month\n",
"ts_enhanced['quarter'] = ts_enhanced.index.quarter\n",
"ts_enhanced['day_of_week'] = ts_enhanced.index.dayofweek  # 0=Monday, 6=Sunday\n",
"ts_enhanced['day_name'] = ts_enhanced.index.day_name()\n",
"ts_enhanced['month_name'] = ts_enhanced.index.month_name()\n",
"ts_enhanced['week_of_year'] = ts_enhanced.index.isocalendar().week\n",
"ts_enhanced['day_of_year'] = ts_enhanced.index.dayofyear\n",
"ts_enhanced['is_month_start'] = ts_enhanced.index.is_month_start\n",
"ts_enhanced['is_month_end'] = ts_enhanced.index.is_month_end\n",
"ts_enhanced['is_quarter_start'] = ts_enhanced.index.is_quarter_start\n",
"ts_enhanced['is_quarter_end'] = ts_enhanced.index.is_quarter_end\n",
"\n",
"print(\"Enhanced dataset with datetime components:\")\n",
"print(ts_enhanced[['sales', 'year', 'month', 'quarter', 'day_name', 'week_of_year']].head())\n",
"\n",
"# Analyze patterns by day of week\n",
"print(\"\\nSales patterns by day of week:\")\n",
"dow_analysis = ts_enhanced.groupby('day_name')['sales'].agg(['mean', 'std', 'count'])\n",
"# Reorder by weekday\n",
"day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']\n",
"dow_analysis = dow_analysis.reindex(day_order)\n",
"print(dow_analysis.round(2))\n",
"\n",
"# Monthly patterns\n",
"print(\"\\nSales patterns by month:\")\n",
"monthly_analysis = ts_enhanced.groupby('month_name')['sales'].agg(['mean', 'std'])\n",
"month_order = ['January', 'February', 'March', 'April', 'May', 'June',\n",
"               'July', 'August', 'September', 'October', 'November', 'December']\n",
"monthly_analysis = monthly_analysis.reindex([m for m in month_order if m in monthly_analysis.index])\n",
"print(monthly_analysis.round(2))"
]
},
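{
"cell_type": "markdown",
"metadata": {},
"source": [
"Grouping by an extracted `month` column pools January 2022 and January 2023 into one bucket. When each calendar month should stay separate, a common alternative is grouping by a `PeriodIndex` via `to_period()`; a minimal sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Group by calendar month (year + month together) using a PeriodIndex\n",
"per_month = ts_enhanced.groupby(ts_enhanced.index.to_period('M'))['sales'].agg(['mean', 'sum'])\n",
"print(\"Per-calendar-month sales (first 6 months):\")\n",
"print(per_month.head(6).round(2))"
]
},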
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Time series visualization\n",
"print(\"=== TIME SERIES VISUALIZATION ===\")\n",
"\n",
"# Create comprehensive time series plots\n",
"fig, axes = plt.subplots(2, 2, figsize=(15, 10))\n",
"\n",
"# Plot 1: Daily sales over time\n",
"ts_data['sales'].plot(ax=axes[0, 0], title='Daily Sales Over Time', alpha=0.7)\n",
"axes[0, 0].set_ylabel('Sales ($)')\n",
"axes[0, 0].grid(True, alpha=0.3)\n",
"\n",
"# Plot 2: Monthly aggregated sales ('ME' = month-end; pandas 2.2 renamed the old 'M' alias)\n",
"monthly_sales = ts_data['sales'].resample('ME').sum()\n",
"monthly_sales.plot(ax=axes[0, 1], title='Monthly Sales', marker='o')\n",
"axes[0, 1].set_ylabel('Monthly Sales ($)')\n",
"axes[0, 1].grid(True, alpha=0.3)\n",
"\n",
"# Plot 3: Sales vs customers correlation\n",
"axes[1, 0].scatter(ts_data['customers'], ts_data['sales'], alpha=0.5)\n",
"axes[1, 0].set_title('Sales vs Customers')\n",
"axes[1, 0].set_xlabel('Number of Customers')\n",
"axes[1, 0].set_ylabel('Sales ($)')\n",
"axes[1, 0].grid(True, alpha=0.3)\n",
"\n",
"# Plot 4: Seasonal pattern (by month)\n",
"ts_enhanced.groupby('month')['sales'].mean().plot(ax=axes[1, 1], kind='bar',\n",
"                                                  title='Average Sales by Month')\n",
"axes[1, 1].set_ylabel('Average Sales ($)')\n",
"axes[1, 1].set_xlabel('Month')\n",
"axes[1, 1].tick_params(axis='x', rotation=45)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Summary statistics\n",
"print(\"\\nTime series summary statistics:\")\n",
"print(ts_data['sales'].describe())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Resampling and Frequency Conversion\n",
"\n",
"Converting between different time frequencies and aggregating data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Basic resampling operations\n",
"print(\"=== BASIC RESAMPLING ===\")\n",
"\n",
"# Resample to different frequencies\n",
"weekly_data = ts_data.resample('W').agg({\n",
"    'sales': 'sum',\n",
"    'customers': 'sum',\n",
"    'marketing_spend': 'sum',\n",
"    'temperature': 'mean'\n",
"})\n",
"\n",
"print(\"Weekly resampled data:\")\n",
"print(weekly_data.head(10))\n",
"\n",
"# Monthly resampling with multiple aggregations\n",
"monthly_data = ts_data.resample('ME').agg({\n",
"    'sales': ['sum', 'mean', 'std', 'min', 'max'],\n",
"    'customers': ['sum', 'mean'],\n",
"    'marketing_spend': 'sum',\n",
"    'temperature': 'mean'\n",
"})\n",
"\n",
"print(\"\\nMonthly resampled data (first 6 months):\")\n",
"print(monthly_data.head(6))\n",
"\n",
"# Quarterly resampling ('QE' = quarter-end)\n",
"quarterly_data = ts_data.resample('QE').agg({\n",
"    'sales': 'sum',\n",
"    'customers': 'sum',\n",
"    'marketing_spend': 'sum'\n",
"})\n",
"\n",
"print(\"\\nQuarterly resampled data:\")\n",
"print(quarterly_data)\n",
"\n",
"# Yearly resampling ('YE' = year-end)\n",
"yearly_data = ts_data.resample('YE').agg({\n",
"    'sales': 'sum',\n",
"    'customers': 'sum',\n",
"    'marketing_spend': 'sum'\n",
"})\n",
"\n",
"print(\"\\nYearly resampled data:\")\n",
"print(yearly_data)\n",
"\n",
"# Calculate year-over-year growth\n",
"if len(yearly_data) > 1:\n",
"    yoy_growth = yearly_data.pct_change() * 100\n",
"    print(\"\\nYear-over-year growth (%):\")\n",
"    print(yoy_growth.round(2))"
]
},
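{
"cell_type": "markdown",
"metadata": {},
"source": [
"`resample()` also accepts `label` and `closed` arguments that control which bin edge names each bin and which edge is inclusive. A quick sketch for weekly bins (note that changing `closed` shifts bin membership, so totals can change too, not just the index labels):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 'label' picks which bin edge names the bin; 'closed' picks which edge is inclusive\n",
"w_right = ts_data['sales'].resample('W', label='right', closed='right').sum()  # the 'W' default\n",
"w_left = ts_data['sales'].resample('W', label='left', closed='left').sum()\n",
"print(\"First labels with label='right':\", w_right.index[:3].tolist())\n",
"print(\"First labels with label='left': \", w_left.index[:3].tolist())"
]
},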
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Advanced resampling techniques\n",
"print(\"=== ADVANCED RESAMPLING ===\")\n",
"\n",
"# Custom aggregation functions\n",
"def coefficient_of_variation(series):\n",
"    \"\"\"Calculate coefficient of variation\"\"\"\n",
"    return series.std() / series.mean() if series.mean() != 0 else 0\n",
"\n",
"def sales_volatility(series):\n",
"    \"\"\"Calculate sales volatility (standard deviation)\"\"\"\n",
"    return series.std()\n",
"\n",
"# Custom resampling with multiple functions\n",
"custom_monthly = ts_data.resample('ME').agg({\n",
"    'sales': ['sum', 'mean', coefficient_of_variation, sales_volatility],\n",
"    'customers': ['sum', 'mean'],\n",
"    'marketing_spend': 'sum'\n",
"})\n",
"\n",
"print(\"Custom monthly aggregations:\")\n",
"print(custom_monthly.round(3))\n",
"\n",
"# Resampling with different anchor points\n",
"# Weekly bins anchored to different end days\n",
"weekly_sunday = ts_data.resample('W-SUN')['sales'].sum()  # Weeks ending Sunday\n",
"weekly_monday = ts_data.resample('W-MON')['sales'].sum()  # Weeks ending Monday\n",
"\n",
"print(\"\\nWeekly totals comparison (first 10 weeks):\")\n",
"weekly_comparison = pd.DataFrame({\n",
"    'Week_End_Sunday': weekly_sunday,\n",
"    'Week_End_Monday': weekly_monday\n",
"})\n",
"print(weekly_comparison.head(10))\n",
"\n",
"# Business-day resampling ('B' bins; weekend values fold into adjacent business-day bins)\n",
"business_daily = ts_data.resample('B').mean()\n",
"print(f\"\\nBusiness-day data shape: {business_daily.shape}\")\n",
"print(\"Business-day averages (first 10):\")\n",
"print(business_daily[['sales', 'customers']].head(10))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Upsampling and downsampling\n",
"print(\"=== UPSAMPLING AND DOWNSAMPLING ===\")\n",
"\n",
"# Downsample to weekly and visualize\n",
"weekly_sales = ts_data['sales'].resample('W').sum()\n",
"\n",
"# Upsample weekly back to daily (forward fill)\n",
"upsampled_ffill = weekly_sales.resample('D').ffill()\n",
"\n",
"# Upsample with interpolation\n",
"upsampled_interp = weekly_sales.resample('D').interpolate()\n",
"\n",
"print(\"Upsampling comparison (sample period):\")\n",
"\n",
"# Compare one month of daily values against both upsampled versions\n",
"start_date = '2023-01-01'\n",
"end_date = '2023-01-31'\n",
"\n",
"upsample_comparison = pd.DataFrame({\n",
"    'Original_Daily': ts_data.loc[start_date:end_date, 'sales'],\n",
"    'Weekly_Upsampled_FFill': upsampled_ffill.loc[start_date:end_date],\n",
"    'Weekly_Upsampled_Interp': upsampled_interp.loc[start_date:end_date]\n",
"})\n",
"\n",
"print(upsample_comparison.head(15))\n",
"\n",
"# Visualize upsampling methods\n",
"plt.figure(figsize=(12, 6))\n",
"plt.plot(upsample_comparison.index, upsample_comparison['Original_Daily'],\n",
"         label='Original Daily', alpha=0.7)\n",
"plt.plot(upsample_comparison.index, upsample_comparison['Weekly_Upsampled_FFill'],\n",
"         label='Forward Fill', linestyle='--')\n",
"plt.plot(upsample_comparison.index, upsample_comparison['Weekly_Upsampled_Interp'],\n",
"         label='Interpolated', linestyle='-.')\n",
"plt.title('Upsampling Methods Comparison')\n",
"plt.xlabel('Date')\n",
"plt.ylabel('Sales ($)')\n",
"plt.legend()\n",
"plt.grid(True, alpha=0.3)\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
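{
"cell_type": "markdown",
"metadata": {},
"source": [
"A third upsampling option is `asfreq()`, which inserts the new timestamps but leaves them as `NaN` rather than filling them. This makes the introduced gaps explicit before you commit to a fill strategy; a short sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# asfreq() upsamples without filling, so the new daily rows are NaN\n",
"upsampled_asfreq = weekly_sales.resample('D').asfreq()\n",
"print(f\"NaN count after asfreq: {upsampled_asfreq.isna().sum()} of {len(upsampled_asfreq)}\")\n",
"print(upsampled_asfreq.head(10))"
]
},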
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Rolling Calculations and Window Functions\n",
"\n",
"Moving averages, rolling statistics, and window-based analysis."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Basic rolling calculations\n",
"print(\"=== BASIC ROLLING CALCULATIONS ===\")\n",
"\n",
"# Calculate various rolling statistics\n",
"rolling_data = ts_data.copy()\n",
"\n",
"# Rolling means (moving averages)\n",
"rolling_data['sales_7d_avg'] = rolling_data['sales'].rolling(window=7).mean()\n",
"rolling_data['sales_30d_avg'] = rolling_data['sales'].rolling(window=30).mean()\n",
"rolling_data['sales_90d_avg'] = rolling_data['sales'].rolling(window=90).mean()\n",
"\n",
"# Rolling standard deviation (volatility)\n",
"rolling_data['sales_7d_std'] = rolling_data['sales'].rolling(window=7).std()\n",
"rolling_data['sales_30d_std'] = rolling_data['sales'].rolling(window=30).std()\n",
"\n",
"# Rolling min/max\n",
"rolling_data['sales_30d_min'] = rolling_data['sales'].rolling(window=30).min()\n",
"rolling_data['sales_30d_max'] = rolling_data['sales'].rolling(window=30).max()\n",
"\n",
"print(\"Rolling statistics (last 10 days):\")\n",
"rolling_cols = ['sales', 'sales_7d_avg', 'sales_30d_avg', 'sales_7d_std', 'sales_30d_std']\n",
"print(rolling_data[rolling_cols].tail(10).round(2))\n",
"\n",
"# Rolling sum for cumulative analysis\n",
"rolling_data['sales_7d_sum'] = rolling_data['sales'].rolling(window=7).sum()\n",
"rolling_data['sales_30d_sum'] = rolling_data['sales'].rolling(window=30).sum()\n",
"\n",
"print(\"\\nRolling sums (last 5 days):\")\n",
"print(rolling_data[['sales', 'sales_7d_sum', 'sales_30d_sum']].tail(5).round(0))"
]
},
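{
"cell_type": "markdown",
"metadata": {},
"source": [
"The first `window - 1` values of each rolling column above are `NaN` because the window is not yet full. The `min_periods` argument relaxes this and emits partial-window results from the start; a small sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# min_periods=1 emits a value as soon as at least one observation is in the window\n",
"strict = ts_data['sales'].rolling(window=7).mean()\n",
"relaxed = ts_data['sales'].rolling(window=7, min_periods=1).mean()\n",
"print(\"First 7 days, strict vs relaxed window:\")\n",
"print(pd.DataFrame({'strict': strict, 'min_periods=1': relaxed}).head(7).round(2))"
]
},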
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Advanced rolling calculations\n",
"print(\"=== ADVANCED ROLLING CALCULATIONS ===\")\n",
"\n",
"# Rolling correlation between variables\n",
"rolling_data['sales_customers_corr_30d'] = rolling_data['sales'].rolling(window=30).corr(rolling_data['customers'])\n",
"rolling_data['sales_marketing_corr_30d'] = rolling_data['sales'].rolling(window=30).corr(rolling_data['marketing_spend'])\n",
"\n",
"print(\"Rolling correlations (last 10 days):\")\n",
"corr_cols = ['sales_customers_corr_30d', 'sales_marketing_corr_30d']\n",
"print(rolling_data[corr_cols].tail(10).round(3))\n",
"\n",
"# Rolling quantiles\n",
"rolling_data['sales_30d_q25'] = rolling_data['sales'].rolling(window=30).quantile(0.25)\n",
"rolling_data['sales_30d_q75'] = rolling_data['sales'].rolling(window=30).quantile(0.75)\n",
"rolling_data['sales_30d_median'] = rolling_data['sales'].rolling(window=30).median()\n",
"\n",
"print(\"\\nRolling quantiles (last 5 days):\")\n",
"quantile_cols = ['sales', 'sales_30d_q25', 'sales_30d_median', 'sales_30d_q75']\n",
"print(rolling_data[quantile_cols].tail(5).round(2))\n",
"\n",
"# Custom rolling functions\n",
"def rolling_cv(series):\n",
"    \"\"\"Rolling coefficient of variation\"\"\"\n",
"    return series.std() / series.mean() if series.mean() != 0 else 0\n",
"\n",
"def rolling_skewness(series):\n",
"    \"\"\"Rolling skewness\"\"\"\n",
"    return series.skew()\n",
"\n",
"rolling_data['sales_30d_cv'] = rolling_data['sales'].rolling(window=30).apply(rolling_cv)\n",
"rolling_data['sales_30d_skew'] = rolling_data['sales'].rolling(window=30).apply(rolling_skewness)\n",
"\n",
"print(\"\\nCustom rolling statistics (last 5 days):\")\n",
"custom_cols = ['sales_30d_cv', 'sales_30d_skew']\n",
"print(rolling_data[custom_cols].tail(5).round(3))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Exponentially weighted functions\n",
"print(\"=== EXPONENTIALLY WEIGHTED FUNCTIONS ===\")\n",
"\n",
"# Exponentially weighted moving average (EWMA)\n",
"rolling_data['sales_ewm_10'] = rolling_data['sales'].ewm(span=10).mean()\n",
"rolling_data['sales_ewm_30'] = rolling_data['sales'].ewm(span=30).mean()\n",
"\n",
"# Exponentially weighted standard deviation\n",
"rolling_data['sales_ewm_std_10'] = rolling_data['sales'].ewm(span=10).std()\n",
"\n",
"print(\"Exponentially weighted statistics (last 10 days):\")\n",
"ewm_cols = ['sales', 'sales_7d_avg', 'sales_ewm_10', 'sales_ewm_30']\n",
"print(rolling_data[ewm_cols].tail(10).round(2))\n",
"\n",
"# Visualize different smoothing methods\n",
"plt.figure(figsize=(15, 8))\n",
"\n",
"# Plot last 90 days for clarity\n",
"recent_period = rolling_data.tail(90)\n",
"\n",
"plt.plot(recent_period.index, recent_period['sales'], label='Original Sales', alpha=0.7)\n",
"plt.plot(recent_period.index, recent_period['sales_7d_avg'], label='7-day MA', linewidth=2)\n",
"plt.plot(recent_period.index, recent_period['sales_30d_avg'], label='30-day MA', linewidth=2)\n",
"plt.plot(recent_period.index, recent_period['sales_ewm_10'], label='EWM (span=10)', linewidth=2)\n",
"\n",
"plt.title('Sales Smoothing Methods Comparison (Last 90 Days)')\n",
"plt.xlabel('Date')\n",
"plt.ylabel('Sales ($)')\n",
"plt.legend()\n",
"plt.grid(True, alpha=0.3)\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Compare how closely each smoothed series tracks the original\n",
"print(\"\\nSmoothing method responsiveness (correlation with original):\")\n",
"responsiveness = {\n",
"    '7-day MA': rolling_data['sales'].corr(rolling_data['sales_7d_avg']),\n",
"    '30-day MA': rolling_data['sales'].corr(rolling_data['sales_30d_avg']),\n",
"    'EWM (span=10)': rolling_data['sales'].corr(rolling_data['sales_ewm_10']),\n",
"    'EWM (span=30)': rolling_data['sales'].corr(rolling_data['sales_ewm_30'])\n",
"}\n",
"\n",
"for method, corr in responsiveness.items():\n",
"    print(f\"{method}: {corr:.4f}\")"
]
},
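{
"cell_type": "markdown",
"metadata": {},
"source": [
"`ewm()` can be parameterized by `span`, `halflife`, or `alpha`; these are interchangeable ways of expressing the same decay rate (pandas converts `span` using `alpha = 2 / (span + 1)`). A quick sketch of the equivalence:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Two ways to set the same smoothing factor: span and alpha\n",
"span = 10\n",
"alpha = 2 / (span + 1)  # the span <-> alpha conversion used by pandas\n",
"by_span = rolling_data['sales'].ewm(span=span).mean()\n",
"by_alpha = rolling_data['sales'].ewm(alpha=alpha).mean()\n",
"print(f\"span={span} and alpha={alpha:.4f} agree: \"\n",
"      f\"{np.allclose(by_span.dropna(), by_alpha.dropna())}\")"
]
},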
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Seasonal Analysis and Decomposition\n",
"\n",
"Analyzing seasonal patterns and decomposing time series."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Seasonal pattern analysis\n",
"print(\"=== SEASONAL PATTERN ANALYSIS ===\")\n",
"\n",
"# Add more detailed time components\n",
"seasonal_data = ts_data.copy()\n",
"seasonal_data['month'] = seasonal_data.index.month\n",
"seasonal_data['quarter'] = seasonal_data.index.quarter\n",
"seasonal_data['day_of_week'] = seasonal_data.index.dayofweek\n",
"seasonal_data['week_of_year'] = seasonal_data.index.isocalendar().week\n",
"seasonal_data['day_of_year'] = seasonal_data.index.dayofyear\n",
"\n",
"# Monthly seasonality\n",
"monthly_sales_pattern = seasonal_data.groupby('month')['sales'].agg(['mean', 'std', 'count'])\n",
"print(\"Monthly sales patterns:\")\n",
"print(monthly_sales_pattern.round(2))\n",
"\n",
"# Day of week patterns\n",
"dow_pattern = seasonal_data.groupby('day_of_week')['sales'].agg(['mean', 'std'])\n",
"dow_pattern.index = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']\n",
"print(\"\\nDay of week patterns:\")\n",
"print(dow_pattern.round(2))\n",
"\n",
"# Weekly patterns throughout the year\n",
"weekly_sales_pattern = seasonal_data.groupby('week_of_year')['sales'].mean()\n",
"print(\"\\nWeekly pattern statistics:\")\n",
"print(f\"Highest week: Week {weekly_sales_pattern.idxmax()} (${weekly_sales_pattern.max():.0f})\")\n",
"print(f\"Lowest week: Week {weekly_sales_pattern.idxmin()} (${weekly_sales_pattern.min():.0f})\")\n",
"print(f\"Weekly variation: {weekly_sales_pattern.std():.2f}\")\n",
"\n",
"# Quarterly analysis\n",
"quarterly_pattern = seasonal_data.groupby('quarter')['sales'].agg(['mean', 'sum', 'std'])\n",
"print(\"\\nQuarterly patterns:\")\n",
"print(quarterly_pattern.round(2))"
]
},
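{
"cell_type": "markdown",
"metadata": {},
"source": [
"Monthly and weekday effects can also be inspected jointly. A sketch of a month-by-weekday table of average sales, built with `pd.pivot_table` from the components already added to `seasonal_data`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Average sales for every (month, day-of-week) combination\n",
"month_dow = pd.pivot_table(seasonal_data, values='sales',\n",
"                           index='month', columns='day_of_week', aggfunc='mean')\n",
"month_dow.columns = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']\n",
"print(\"Average sales by month (rows) and weekday (columns):\")\n",
"print(month_dow.round(0))"
]
},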
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Simple seasonal decomposition\n",
"print(\"=== SEASONAL DECOMPOSITION ===\")\n",
"\n",
"# Manual decomposition approach\n",
"def simple_decompose(series, period=365):\n",
"    \"\"\"Simple additive seasonal decomposition\"\"\"\n",
"    # Trend (using centered moving average)\n",
"    trend = series.rolling(window=period, center=True).mean()\n",
"\n",
"    # Detrended series\n",
"    detrended = series - trend\n",
"\n",
"    # Seasonal component: average for each day of year, mapped back onto the index\n",
"    seasonal_avg = detrended.groupby(detrended.index.dayofyear).mean()\n",
"    seasonal = pd.Series(series.index.dayofyear, index=series.index).map(seasonal_avg)\n",
"    seasonal = seasonal.fillna(0)  # Days with no average (e.g., leap day) contribute nothing\n",
"\n",
"    # Residual (what's left after removing trend and seasonality)\n",
"    residual = series - trend - seasonal\n",
"\n",
"    return trend, seasonal, residual\n",
"\n",
"# Decompose sales data\n",
"trend, seasonal, residual = simple_decompose(ts_data['sales'])\n",
"\n",
"# Create decomposition DataFrame\n",
"decomposition = pd.DataFrame({\n",
"    'original': ts_data['sales'],\n",
"    'trend': trend,\n",
"    'seasonal': seasonal,\n",
"    'residual': residual\n",
"})\n",
"\n",
"print(\"Decomposition summary:\")\n",
"print(decomposition.describe().round(2))\n",
"\n",
"# Visualize decomposition\n",
"fig, axes = plt.subplots(4, 1, figsize=(15, 12))\n",
"\n",
"# Original series\n",
"decomposition['original'].plot(ax=axes[0], title='Original Sales Data')\n",
"axes[0].set_ylabel('Sales ($)')\n",
"axes[0].grid(True, alpha=0.3)\n",
"\n",
"# Trend\n",
"decomposition['trend'].plot(ax=axes[1], title='Trend Component', color='red')\n",
"axes[1].set_ylabel('Trend ($)')\n",
"axes[1].grid(True, alpha=0.3)\n",
"\n",
"# Seasonal\n",
"decomposition['seasonal'].plot(ax=axes[2], title='Seasonal Component', color='green')\n",
"axes[2].set_ylabel('Seasonal ($)')\n",
"axes[2].grid(True, alpha=0.3)\n",
"\n",
"# Residual\n",
"decomposition['residual'].plot(ax=axes[3], title='Residual Component', color='purple')\n",
"axes[3].set_ylabel('Residual ($)')\n",
"axes[3].set_xlabel('Date')\n",
"axes[3].grid(True, alpha=0.3)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"print(\"\\nDecomposition insights:\")\n",
"print(f\"Trend contribution: {trend.std():.2f} (std dev)\")\n",
"print(f\"Seasonal contribution: {seasonal.std():.2f} (std dev)\")\n",
"print(f\"Residual contribution: {residual.std():.2f} (std dev)\")\n",
"print(f\"Total variation: {ts_data['sales'].std():.2f} (std dev)\")"
]
},
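{
"cell_type": "markdown",
"metadata": {},
"source": [
"For production work, `statsmodels` ships a ready-made decomposition that returns the same trend/seasonal/residual pieces. A hedged sketch (assumes `statsmodels` is installed in your environment; it is not imported at the top of this notebook):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Library-based alternative to the manual decomposition above\n",
"from statsmodels.tsa.seasonal import seasonal_decompose\n",
"\n",
"result = seasonal_decompose(ts_data['sales'], model='additive', period=365)\n",
"print(f\"Trend std:    {result.trend.std():.2f}\")\n",
"print(f\"Seasonal std: {result.seasonal.std():.2f}\")\n",
"print(f\"Residual std: {result.resid.std():.2f}\")\n",
"result.plot()\n",
"plt.tight_layout()\n",
"plt.show()"
]
},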
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Advanced seasonal analysis\n",
"print(\"=== ADVANCED SEASONAL ANALYSIS ===\")\n",
"\n",
"# Year-over-year comparison\n",
"yoy_comparison = pd.DataFrame()\n",
"for year in ts_data.index.year.unique():\n",
"    year_data = ts_data[ts_data.index.year == year]['sales']\n",
"    year_data.index = year_data.index.dayofyear\n",
"    yoy_comparison[f'Year_{year}'] = year_data\n",
"\n",
"print(\"Year-over-year comparison (sample days):\")\n",
"print(yoy_comparison.head(10).round(2))\n",
"\n",
"# Calculate year-over-year changes\n",
"if len(yoy_comparison.columns) > 1:\n",
"    yoy_change = yoy_comparison.pct_change(axis=1) * 100\n",
"    print(\"\\nYear-over-year change statistics:\")\n",
"    for col in yoy_change.columns[1:]:\n",
"        print(f\"{col}: mean={yoy_change[col].mean():.2f}%, std={yoy_change[col].std():.2f}%\")\n",
"\n",
"# Seasonal strength measurement\n",
"def seasonal_strength(series, period=365):\n",
"    \"\"\"Calculate seasonal strength (0 = no seasonality, 1 = pure seasonality)\"\"\"\n",
"    # Detrend the series\n",
"    trend = series.rolling(window=period, center=True).mean()\n",
"    detrended = series - trend\n",
"\n",
"    # Calculate seasonal component\n",
"    seasonal_avg = detrended.groupby(detrended.index.dayofyear).mean()\n",
"    seasonal_var = seasonal_avg.var()\n",
"\n",
"    # Map per-day averages back onto the full index and compute residual variance\n",
"    seasonal_full = pd.Series(series.index.dayofyear, index=series.index).map(seasonal_avg).fillna(0)\n",
"    residual = detrended - seasonal_full\n",
"    residual_var = residual.var()\n",
"\n",
"    # Seasonal strength\n",
"    return seasonal_var / (seasonal_var + residual_var)\n",
"\n",
"sales_seasonal_strength = seasonal_strength(ts_data['sales'])\n",
"print(f\"\\nSales seasonal strength: {sales_seasonal_strength:.3f}\")\n",
"print(\"(0 = no seasonality, 1 = pure seasonality)\")\n",
"\n",
"# Identify most/least seasonal periods\n",
"monthly_seasonal = seasonal_data.groupby('month')['sales'].std()\n",
"print(f\"\\nMost variable month: {monthly_seasonal.idxmax()} (std: {monthly_seasonal.max():.2f})\")\n",
"print(f\"Least variable month: {monthly_seasonal.idxmin()} (std: {monthly_seasonal.min():.2f})\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Business Applications and Forecasting\n",
"\n",
"Real-world time series analysis for business insights."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Business performance metrics\n",
"print(\"=== BUSINESS PERFORMANCE METRICS ===\")\n",
"\n",
"def calculate_business_metrics(df):\n",
"    \"\"\"Calculate key business time series metrics\"\"\"\n",
"    metrics = {}\n",
"\n",
"    # Growth metrics (last 30 days vs first 30 days)\n",
"    daily_sales = df['sales']\n",
"    metrics['total_sales'] = daily_sales.sum()\n",
"    metrics['avg_daily_sales'] = daily_sales.mean()\n",
"    metrics['sales_growth_rate'] = (daily_sales.iloc[-30:].mean() / daily_sales.iloc[:30].mean() - 1) * 100\n",
"\n",
"    # Volatility metrics\n",
"    metrics['sales_volatility'] = daily_sales.std()\n",
"    metrics['coefficient_of_variation'] = daily_sales.std() / daily_sales.mean()\n",
"\n",
"    # Trend metrics\n",
"    trend = daily_sales.rolling(window=30).mean()\n",
"    metrics['trend_direction'] = 'Increasing' if trend.iloc[-1] > trend.iloc[-30] else 'Decreasing'\n",
"    metrics['trend_strength'] = abs(trend.iloc[-1] - trend.iloc[-30]) / trend.iloc[-30] * 100\n",
"\n",
"    # Customer metrics\n",
"    metrics['avg_customers_per_day'] = df['customers'].mean()\n",
"    metrics['sales_per_customer'] = df['sales'].sum() / df['customers'].sum()\n",
"\n",
"    # Marketing efficiency (revenue per marketing dollar; a simple ratio, not true ROI)\n",
"    metrics['marketing_roi'] = df['sales'].sum() / df['marketing_spend'].sum()\n",
"\n",
"    return metrics\n",
"\n",
"# Calculate metrics for different periods\n",
"overall_metrics = calculate_business_metrics(ts_data)\n",
"recent_metrics = calculate_business_metrics(ts_data.tail(90))  # Last 90 days\n",
"\n",
"print(\"Overall Performance Metrics:\")\n",
"for metric, value in overall_metrics.items():\n",
"    if isinstance(value, float):\n",
"        print(f\"{metric}: {value:.2f}\")\n",
"    else:\n",
"        print(f\"{metric}: {value}\")\n",
"\n",
"print(\"\\nRecent 90-day Performance Metrics:\")\n",
"for metric, value in recent_metrics.items():\n",
"    if isinstance(value, float):\n",
"        print(f\"{metric}: {value:.2f}\")\n",
"    else:\n",
"        print(f\"{metric}: {value}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Simple forecasting using historical patterns\n",
"print(\"=== SIMPLE FORECASTING ===\")\n",
"\n",
"def simple_forecast(series, periods=30, method='seasonal_naive'):\n",
"    \"\"\"Simple forecasting methods\"\"\"\n",
"    if method == 'naive':\n",
"        # Naive: repeat last value\n",
"        return pd.Series([series.iloc[-1]] * periods,\n",
"                         index=pd.date_range(series.index[-1] + pd.Timedelta(days=1), periods=periods))\n",
"\n",
"    elif method == 'seasonal_naive':\n",
"        # Seasonal naive: repeat same day from previous year\n",
"        forecast_dates = pd.date_range(series.index[-1] + pd.Timedelta(days=1), periods=periods)\n",
"        forecast_values = []\n",
"\n",
"        for date in forecast_dates:\n",
"            # Find same day of year from previous year\n",
"            previous_year_date = date - pd.DateOffset(years=1)\n",
"            if previous_year_date in series.index:\n",
"                forecast_values.append(series.loc[previous_year_date])\n",
"            else:\n",
"                # Fallback to seasonal average\n",
"                day_of_year = date.dayofyear\n",
"                same_day_values = series[series.index.dayofyear == day_of_year]\n",
"                if len(same_day_values) > 0:\n",
"                    forecast_values.append(same_day_values.mean())\n",
"                else:\n",
"                    forecast_values.append(series.mean())\n",
"\n",
"        return pd.Series(forecast_values, index=forecast_dates)\n",
"\n",
"    elif method == 'moving_average':\n",
"        # Moving average forecast\n",
"        ma_value = series.tail(30).mean()\n",
"        return pd.Series([ma_value] * periods,\n",
"                         index=pd.date_range(series.index[-1] + pd.Timedelta(days=1), periods=periods))\n",
"\n",
"    elif method == 'trend':\n",
"        # Linear trend forecast (fitted on x = 0..n-1, so the next point is x = n)\n",
"        from scipy import stats\n",
"        x = np.arange(len(series))\n",
"        slope, intercept, _, _, _ = stats.linregress(x, series.values)\n",
"\n",
"        forecast_dates = pd.date_range(series.index[-1] + pd.Timedelta(days=1), periods=periods)\n",
"        forecast_values = [slope * (len(series) + i) + intercept for i in range(periods)]\n",
"\n",
"        return pd.Series(forecast_values, index=forecast_dates)\n",
"\n",
"    else:\n",
"        raise ValueError(f\"Unknown forecasting method: {method}\")\n",
"\n",
"# Generate forecasts using different methods\n",
"forecast_periods = 30\n",
"sales_series = ts_data['sales']\n",
"\n",
"forecasts = {\n",
"    'Naive': simple_forecast(sales_series, forecast_periods, 'naive'),\n",
"    'Seasonal_Naive': simple_forecast(sales_series, forecast_periods, 'seasonal_naive'),\n",
"    'Moving_Average': simple_forecast(sales_series, forecast_periods, 'moving_average'),\n",
"    'Trend': simple_forecast(sales_series, forecast_periods, 'trend')\n",
"}\n",
"\n",
"print(f\"Forecasts for next {forecast_periods} days:\")\n",
"forecast_df = pd.DataFrame(forecasts)\n",
"print(forecast_df.head(10).round(2))\n",
"\n",
"print(\"\\nForecast summary statistics:\")\n",
"print(forecast_df.describe().round(2))\n",
"\n",
"# Visualize forecasts\n",
"plt.figure(figsize=(15, 8))\n",
"\n",
"# Plot historical data (last 90 days)\n",
"historical_period = sales_series.tail(90)\n",
"plt.plot(historical_period.index, historical_period.values, label='Historical Data', color='black', linewidth=2)\n",
"\n",
"# Plot forecasts\n",
"colors = ['red', 'blue', 'green', 'orange']\n",
"for i, (method, forecast) in enumerate(forecasts.items()):\n",
"    plt.plot(forecast.index, forecast.values, label=f'{method} Forecast',\n",
"             color=colors[i], linestyle='--', linewidth=2)\n",
"\n",
"plt.title('Sales Forecasting Comparison')\n",
"plt.xlabel('Date')\n",
"plt.ylabel('Sales ($)')\n",
"plt.legend()\n",
"plt.grid(True, alpha=0.3)\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
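{
"cell_type": "markdown",
"metadata": {},
"source": [
"None of these forecasts has been scored yet. A minimal holdout evaluation: hold back the last 30 observed days, forecast them from the earlier history only, and compare with MAE and MAPE (computed by hand here, so no extra libraries are assumed):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hold out the last 30 days and score each method against what actually happened\n",
"holdout = 30\n",
"train, test = sales_series.iloc[:-holdout], sales_series.iloc[-holdout:]\n",
"\n",
"print(f\"{'Method':<16}{'MAE':>10}{'MAPE %':>10}\")\n",
"for method in ['naive', 'seasonal_naive', 'moving_average', 'trend']:\n",
"    pred = simple_forecast(train, holdout, method)\n",
"    pred.index = test.index  # forecast dates line up with the held-out dates\n",
"    mae = (test - pred).abs().mean()\n",
"    mape = ((test - pred).abs() / test).mean() * 100\n",
"    print(f\"{method:<16}{mae:>10.2f}{mape:>10.2f}\")"
]
},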
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Anomaly detection in time series\n",
"print(\"=== ANOMALY DETECTION ===\")\n",
"\n",
"def detect_anomalies(series, method='zscore', threshold=3):\n",
"    \"\"\"Detect anomalies in time series\"\"\"\n",
"    anomalies = pd.Series(False, index=series.index)\n",
"\n",
"    if method == 'zscore':\n",
"        # Z-score method\n",
"        z_scores = np.abs((series - series.mean()) / series.std())\n",
"        anomalies = z_scores > threshold\n",
"\n",
"    elif method == 'iqr':\n",
"        # Interquartile range method\n",
"        Q1 = series.quantile(0.25)\n",
"        Q3 = series.quantile(0.75)\n",
"        IQR = Q3 - Q1\n",
"        lower_bound = Q1 - 1.5 * IQR\n",
"        upper_bound = Q3 + 1.5 * IQR\n",
"        anomalies = (series < lower_bound) | (series > upper_bound)\n",
"\n",
"    elif method == 'rolling':\n",
"        # Rolling window method\n",
"        rolling_mean = series.rolling(window=30).mean()\n",
"        rolling_std = series.rolling(window=30).std()\n",
"        z_scores = np.abs((series - rolling_mean) / rolling_std)\n",
"        anomalies = z_scores > threshold\n",
"\n",
"    return anomalies\n",
"\n",
"# Detect anomalies using different methods\n",
"anomaly_methods = ['zscore', 'iqr', 'rolling']\n",
"anomaly_results = {}\n",
"\n",
"for method in anomaly_methods:\n",
"    anomalies = detect_anomalies(ts_data['sales'], method=method)\n",
"    anomaly_results[method] = anomalies\n",
"    print(f\"{method.upper()} method: {anomalies.sum()} anomalies detected\")\n",
"\n",
"# Combine anomaly detection results\n",
"anomaly_df = pd.DataFrame(anomaly_results)\n",
"anomaly_df['any_method'] = anomaly_df.any(axis=1)\n",
"anomaly_df['all_methods'] = anomaly_df[anomaly_methods].all(axis=1)\n",
"\n",
"print(f\"\\nAnomalies detected by any method: {anomaly_df['any_method'].sum()}\")\n",
"print(f\"Anomalies detected by all methods: {anomaly_df['all_methods'].sum()}\")\n",
"\n",
"# Show anomalous dates\n",
"severe_anomalies = ts_data[anomaly_df['all_methods']]\n",
"if len(severe_anomalies) > 0:\n",
"    print(\"\\nSevere anomalies (detected by all methods):\")\n",
"    print(severe_anomalies[['sales', 'customers', 'marketing_spend']].round(2))\n",
"\n",
"# Visualize anomalies\n",
"plt.figure(figsize=(15, 8))\n",
"\n",
"# Plot sales data\n",
"plt.plot(ts_data.index, ts_data['sales'], label='Sales Data', alpha=0.7)\n",
"\n",
"# Highlight anomalies\n",
"for method in anomaly_methods:\n",
"    anomaly_dates = ts_data.index[anomaly_results[method]]\n",
"    anomaly_values = ts_data.loc[anomaly_dates, 'sales']\n",
"    plt.scatter(anomaly_dates, anomaly_values, label=f'{method.upper()} Anomalies', alpha=0.7, s=30)\n",
"\n",
"plt.title('Sales Data with Anomaly Detection')\n",
"plt.xlabel('Date')\n",
"plt.ylabel('Sales ($)')\n",
"plt.legend()\n",
"plt.grid(True, alpha=0.3)\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Anomaly statistics\n",
"print(\"\\nAnomaly statistics:\")\n",
"for method in anomaly_methods:\n",
"    anomaly_sales = ts_data.loc[anomaly_results[method], 'sales']\n",
"    if len(anomaly_sales) > 0:\n",
"        print(f\"{method.upper()}: mean=${anomaly_sales.mean():.2f}, std=${anomaly_sales.std():.2f}\")\n",
"    else:\n",
"        print(f\"{method.upper()}: No anomalies detected\")"
]
},
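{
"cell_type": "markdown",
"metadata": {},
"source": [
"The z-score method is itself distorted by the outliers it hunts, because the mean and standard deviation are not robust statistics. A sketch of a median/MAD variant, a common robust alternative (the 0.6745 constant rescales MAD to match the standard deviation under normality):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Robust z-scores: replace mean/std with median/MAD\n",
"median = ts_data['sales'].median()\n",
"mad = (ts_data['sales'] - median).abs().median()\n",
"robust_z = 0.6745 * (ts_data['sales'] - median).abs() / mad\n",
"mad_anomalies = robust_z > 3\n",
"print(f\"MAD method: {mad_anomalies.sum()} anomalies detected\")\n",
"print(f\"Overlap with plain z-score: {(mad_anomalies & anomaly_results['zscore']).sum()}\")"
]
},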
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Practice Exercises\n",
"\n",
"Apply time series analysis to complex business scenarios:"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Exercise 1: Comprehensive Time Series Dashboard\n",
"# Create a complete time series analysis dashboard that includes:\n",
"# - Multiple time series metrics and KPIs\n",
"# - Seasonal analysis and trend identification\n",
"# - Anomaly detection and alerting\n",
"# - Forecasting with confidence intervals\n",
"# - Business insights and recommendations\n",
"\n",
"def create_time_series_dashboard(df):\n",
"    \"\"\"Create comprehensive time series analysis dashboard\"\"\"\n",
"    # Your implementation here\n",
"    pass\n",
"\n",
"# dashboard = create_time_series_dashboard(ts_data)\n",
"# print(\"Time Series Dashboard Created\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Exercise 2: Multi-variate Time Series Analysis\n",
"# Analyze relationships between multiple time series:\n",
"# - Cross-correlation analysis\n",
"# - Lead-lag relationships\n",
"# - Causality testing\n",
"# - Multi-variate forecasting\n",
"\n",
"# Your code here:\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Exercise 3: Advanced Forecasting Challenge\n",
"# Implement more sophisticated forecasting methods:\n",
"# - Exponential smoothing with trend and seasonality\n",
"# - ARIMA modeling\n",
"# - Model evaluation and selection\n",
"# - Forecast accuracy metrics\n",
"\n",
"# Your code here:\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Key Takeaways\n",
"\n",
"1. **DateTime Indexing**:\n",
"   - Use `pd.DatetimeIndex` for time-based operations\n",
"   - Enable powerful time-based selection and slicing\n",
"   - Extract components (year, month, day, etc.) for analysis\n",
"\n",
"2. **Resampling**:\n",
"   - **`.resample()`**: Convert between different frequencies\n",
"   - **Downsampling**: Aggregate to lower frequency (daily → monthly)\n",
"   - **Upsampling**: Convert to higher frequency (monthly → daily)\n",
"   - Use appropriate aggregation functions for your data\n",
"\n",
"3. **Rolling Calculations**:\n",
"   - **`.rolling()`**: Moving window calculations\n",
"   - **`.ewm()`**: Exponentially weighted functions\n",
"   - Useful for smoothing and trend analysis\n",
"   - Handle missing values appropriately (e.g., with `min_periods`)\n",
"\n",
"4. **Seasonal Analysis**:\n",
"   - Identify patterns by time components\n",
"   - Decompose into trend, seasonal, and residual\n",
"   - Measure seasonal strength and variability\n",
"\n",
"## Time Series Quick Reference\n",
"\n",
"```python\n",
"# Create datetime index\n",
"df.set_index(pd.to_datetime(df['date']), inplace=True)\n",
"\n",
"# Time-based selection (use .loc; df['2023'] would look up a column)\n",
"df.loc['2023'] # Select year\n",
"df.loc['2023-01'] # Select month\n",
"df.loc['2023-01-01':'2023-01-31'] # Date range\n",
"\n",
"# Resampling\n",
"df.resample('ME').sum() # Month-end sum\n",
"df.resample('W').mean() # Weekly average\n",
"df.resample('QE').agg({'col': ['sum', 'mean']}) # Quarterly multi-agg\n",
"\n",
"# Rolling calculations\n",
"df['col'].rolling(7).mean() # 7-period moving average\n",
"df['col'].ewm(span=10).mean() # Exponential moving average\n",
"df['col'].rolling(30).std() # 30-period rolling standard deviation\n",
"```\n",
"\n",
"## Business Applications\n",
"\n",
"| Use Case | Technique | Key Insights |\n",
"|----------|-----------|-------------|\n",
"| Sales forecasting | Seasonal decomposition + trends | Predict future performance |\n",
"| Anomaly detection | Rolling statistics + thresholds | Identify unusual patterns |\n",
"| Performance monitoring | Moving averages + KPIs | Track business health |\n",
"| Seasonal planning | Seasonal analysis | Optimize inventory/staffing |\n",
"| Marketing ROI | Cross-correlation analysis | Measure campaign effectiveness |\n",
"\n",
"## Best Practices\n",
"\n",
"1. **Data Quality**: Ensure consistent time intervals and handle missing data\n",
"2. **Frequency Choice**: Choose appropriate resampling frequency for your analysis\n",
"3. **Window Size**: Balance responsiveness vs. smoothness in rolling calculations\n",
"4. **Seasonality**: Always check for and account for seasonal patterns\n",
"5. **Validation**: Use holdout periods to validate forecasting models\n",
"6. **Business Context**: Interpret results in context of business cycles and events\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}