import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency, norm, zscore
from scipy.stats import skew, kurtosis
from scipy.stats import ttest_ind, mannwhitneyu, shapiro
import statsmodels.api as sm
# First pass: load both CSV exports with pandas' default (comma) delimiter
# and preview the control data to check how the file was parsed.
control_df = pd.read_csv('control_group.csv')
test_df = pd.read_csv('test_group.csv')
control_df.head()
Campaign Name;Date;Spend [USD];# of Impressions;Reach;# of Website Clicks;# of Searches;# of View Content;# of Add to Cart;# of Purchase | |
---|---|
0 | Control Campaign;1.08.2019;2280;82702;56930;70... |
1 | Control Campaign;2.08.2019;1757;121040;102513;... |
2 | Control Campaign;3.08.2019;2343;131711;110862;... |
3 | Control Campaign;4.08.2019;1940;72878;61235;30... |
4 | Control Campaign;5.08.2019;1835;;;;;;; |
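The preview shows every row packed into a single column: the files are semicolon-delimited rather than comma-delimited, so they need to be re-read with `sep=";"`.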
# The exports are semicolon-delimited, so re-read both files with an
# explicit separator and preview the control data again.
control_df, test_df = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in ('control_group.csv', 'test_group.csv')
)
control_df.head()
Campaign Name | Date | Spend [USD] | # of Impressions | Reach | # of Website Clicks | # of Searches | # of View Content | # of Add to Cart | # of Purchase | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Control Campaign | 1.08.2019 | 2280 | 82702.0 | 56930.0 | 7016.0 | 2290.0 | 2159.0 | 1819.0 | 618.0 |
1 | Control Campaign | 2.08.2019 | 1757 | 121040.0 | 102513.0 | 8110.0 | 2033.0 | 1841.0 | 1219.0 | 511.0 |
2 | Control Campaign | 3.08.2019 | 2343 | 131711.0 | 110862.0 | 6508.0 | 1737.0 | 1549.0 | 1134.0 | 372.0 |
3 | Control Campaign | 4.08.2019 | 1940 | 72878.0 | 61235.0 | 3065.0 | 1042.0 | 982.0 | 1183.0 | 340.0 |
4 | Control Campaign | 5.08.2019 | 1835 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Summarise dtypes and non-null counts to spot missing values in the control data.
control_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 30 entries, 0 to 29 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Campaign Name 30 non-null object 1 Date 30 non-null object 2 Spend [USD] 30 non-null int64 3 # of Impressions 29 non-null float64 4 Reach 29 non-null float64 5 # of Website Clicks 29 non-null float64 6 # of Searches 29 non-null float64 7 # of View Content 29 non-null float64 8 # of Add to Cart 29 non-null float64 9 # of Purchase 29 non-null float64 dtypes: float64(7), int64(1), object(2) memory usage: 2.5+ KB
# Preview the first rows of the test-group data.
test_df.head()
Campaign Name | Date | Spend [USD] | # of Impressions | Reach | # of Website Clicks | # of Searches | # of View Content | # of Add to Cart | # of Purchase | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Test Campaign | 1.08.2019 | 3008 | 39550 | 35820 | 3038 | 1946 | 1069 | 894 | 255 |
1 | Test Campaign | 2.08.2019 | 2542 | 100719 | 91236 | 4657 | 2359 | 1548 | 879 | 677 |
2 | Test Campaign | 3.08.2019 | 2365 | 70263 | 45198 | 7885 | 2572 | 2367 | 1268 | 578 |
3 | Test Campaign | 4.08.2019 | 2710 | 78451 | 25937 | 4216 | 2216 | 1437 | 566 | 340 |
4 | Test Campaign | 5.08.2019 | 2297 | 114295 | 95138 | 5863 | 2106 | 858 | 956 | 768 |
# Summarise dtypes and non-null counts of the test-group data.
test_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 30 entries, 0 to 29 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Campaign Name 30 non-null object 1 Date 30 non-null object 2 Spend [USD] 30 non-null int64 3 # of Impressions 30 non-null int64 4 Reach 30 non-null int64 5 # of Website Clicks 30 non-null int64 6 # of Searches 30 non-null int64 7 # of View Content 30 non-null int64 8 # of Add to Cart 30 non-null int64 9 # of Purchase 30 non-null int64 dtypes: int64(8), object(2) memory usage: 2.5+ KB
# Tidy both frames the same way: drop the campaign-name column (each frame
# already holds a single group) and swap the verbose export headers for
# short names. The names are assigned by position, so their order must
# match the remaining columns.
column_names = [
    'Date', 'Spent', 'Impressions', 'Reach', 'Clicks',
    'Searches', 'Views', 'Add to Cart', 'Purchases',
]
for frame in (control_df, test_df):
    frame.drop(columns=['Campaign Name'], inplace=True)
    frame.columns = column_names
control_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 30 entries, 0 to 29 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 30 non-null object 1 Spent 30 non-null int64 2 Impressions 29 non-null float64 3 Reach 29 non-null float64 4 Clicks 29 non-null float64 5 Searches 29 non-null float64 6 Views 29 non-null float64 7 Add to Cart 29 non-null float64 8 Purchases 29 non-null float64 dtypes: float64(7), int64(1), object(1) memory usage: 2.2+ KB
# Fill the gaps in every column from position 2 onwards
# (Impressions ... Purchases) with that column's median, then confirm
# that no nulls remain.
metric_values = control_df.iloc[:, 2:]
control_df.iloc[:, 2:] = metric_values.fillna(metric_values.median())
control_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 30 entries, 0 to 29 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 30 non-null object 1 Spent 30 non-null int64 2 Impressions 30 non-null float64 3 Reach 30 non-null float64 4 Clicks 30 non-null float64 5 Searches 30 non-null float64 6 Views 30 non-null float64 7 Add to Cart 30 non-null float64 8 Purchases 30 non-null float64 dtypes: float64(7), int64(1), object(1) memory usage: 2.2+ KB
Definition:
Cost Per Purchase (CPP) is the average amount of money spent on a campaign to generate one purchase.
Formula:
It is calculated as:
$ \begin{align} \text{CPP} &= \frac{\text{Total Campaign Spend}}{\text{Total Number of Purchases}} \end{align} $
# Cost per purchase (CPP) for each day: spend divided by purchases.
control_df['CPP'] = control_df['Spent'].astype(float).div(control_df['Purchases'])
test_df['CPP'] = test_df['Spent'].astype(float).div(test_df['Purchases'])
# Overlay the CPP histograms (with KDE curves) of both groups on one set
# of axes; the control bars are semi-transparent so the test bars stay visible.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(test_df['CPP'], kde=True, label='Test', color='orange', bins=12, ax=ax)
sns.histplot(control_df['CPP'], kde=True, label='Control', color='blue', bins=12, alpha=0.5, ax=ax)
ax.set_title('Distribution of CPP')
ax.set_xlabel('CPP (USD per Purchase)')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()
# Shapiro-Wilk normality check on CPP for each group. Only the p-value
# (second element of the result) is needed.
p_control = shapiro(control_df['CPP'])[1]
p_test = shapiro(test_df['CPP'])[1]
print(f"Shapiro-Wilk p-value (Control): {p_control}")
print(f"Shapiro-Wilk p-value (Test): {p_test}")
Shapiro-Wilk p-value (Control): 0.011892481707036495 Shapiro-Wilk p-value (Test): 0.006934247445315123
Since the Shapiro-Wilk p-values for both the Control and Test groups are less than 0.05, we reject the hypothesis of normality: the CPP (Cost Per Purchase) data is not normally distributed in either group. We should therefore use a non-parametric test (e.g., the Mann-Whitney U test) instead of a parametric test such as the t-test.
# Compare CPP between the two groups with a two-sided Mann-Whitney U test.
# NaNs are dropped first so that missing values cannot reach the test.
stat, p_value = mannwhitneyu(
    control_df['CPP'].dropna(),
    test_df['CPP'].dropna(),
    alternative='two-sided',
)
print(f"Mann-Whitney U Statistic: {stat}")
print(f"P-value: {p_value}")

# Interpret the result at the 5% significance level.
# NOTE: the messages say "CPA"; the metric being tested is the CPP column.
if p_value < 0.05:
    print("Reject the null hypothesis: There is a statistically significant difference in CPA between the two groups.")
else:
    print("Fail to reject the null hypothesis: There is no statistically significant difference in CPA between the two groups.")

# Step 2: Visualize the CPP comparison.
# ignore_index avoids duplicate row labels in the stacked frame.
cpp_by_group = pd.concat(
    [control_df.assign(Group='Control'), test_df.assign(Group='Test')],
    ignore_index=True,
)
plt.figure(figsize=(8, 6))
sns.boxplot(x='Group', y='CPP', data=cpp_by_group)
plt.title('Comparison of CPP: Control vs. Test')
# Axis label matches the plotted column and the title (was "CPA").
plt.ylabel('CPP (USD per Purchase)')
plt.show()
Mann-Whitney U Statistic: 369.0 P-value: 0.2339889162810581 Fail to reject the null hypothesis: There is no statistically significant difference in CPA between the two groups.
Mann-Whitney U Test:
Interpretation:
The Mann-Whitney U test gives a p-value of 0.234, which is greater than 0.05. We therefore fail to reject the null hypothesis: there is no statistically significant difference in CPP (Cost Per Purchase, labelled "CPA" in the output above) between the Control and Test groups.
No Significant Difference:
Implications:
Analyze other key performance indicators (KPIs) such as:
Conversion Rate: $ \begin{align} \text{Conversion Rate} = \frac{\text{Number of Purchases}}{\text{Number of Website Clicks}} \end{align} $
Click-Through Rate (CTR): $ \begin{align} \text{CTR} = \frac{\text{Number of Clicks}}{\text{Number of Impressions}} \end{align} $
Return on Investment (ROI): $ \begin{align} \text{ROI} = \frac{\text{Revenue - Campaign Spend}}{\text{Campaign Spend}} \end{align} $
Customer Lifetime Value (CLV): $ \begin{align} \text{CLV} = \text{Average Purchase Value} \times \text{Purchase Frequency} \times \text{Customer Lifespan} \end{align} $
The dataset currently contains the data needed only for CR and CTR.
# Funnel ratios for each day, computed the same way for both groups:
#   CR  = purchases per website click
#   CTR = website clicks per impression
for frame in (control_df, test_df):
    frame['CR'] = frame['Purchases'].div(frame['Clicks'])
    frame['CTR'] = frame['Clicks'].div(frame['Impressions'])
Null Hypothesis (H₀):
Alternative Hypothesis (H₁):
Null Hypothesis (H₀):
Alternative Hypothesis (H₁):
# Shapiro-Wilk normality checks for CR and CTR in each group. Only the
# p-value (second element of each result) is kept.
p_cr_control = shapiro(control_df['CR'])[1]
p_cr_test = shapiro(test_df['CR'])[1]
p_ctr_control = shapiro(control_df['CTR'])[1]
p_ctr_test = shapiro(test_df['CTR'])[1]

print(f"Shapiro-Wilk p-value (CR Control): {p_cr_control}")
print(f"Shapiro-Wilk p-value (CR Test): {p_cr_test}")
print(f"Shapiro-Wilk p-value (CTR Control): {p_ctr_control}")
print(f"Shapiro-Wilk p-value (CTR Test): {p_ctr_test}")
Shapiro-Wilk p-value (CR Control): 0.005544630810618401 Shapiro-Wilk p-value (CR Test): 0.037268370389938354 Shapiro-Wilk p-value (CTR Control): 0.25710153579711914 Shapiro-Wilk p-value (CTR Test): 0.00040253173210658133
# Stack both groups into one long frame tagged by campaign. ignore_index
# gives the result a fresh 0..n-1 index, so no row labels are duplicated.
combined_df = pd.concat(
    [
        control_df.assign(Group='Control Campaign'),
        test_df.assign(Group='Test Campaign'),
    ],
    ignore_index=True,
)
CR Control:
CR Test:
CTR Control:
CTR Test:
Conversion Rate (CR):
Click-Through Rate (CTR):
# Two-sided Mann-Whitney U tests comparing Control vs. Test, first on CR
# and then on CTR.
stat_cr, p_value_cr = mannwhitneyu(
    x=control_df['CR'], y=test_df['CR'], alternative='two-sided'
)
print(f"Mann-Whitney U Statistic (CR): {stat_cr}")
print(f"P-value (CR): {p_value_cr}")

stat_ctr, p_value_ctr = mannwhitneyu(
    x=control_df['CTR'], y=test_df['CTR'], alternative='two-sided'
)
print(f"Mann-Whitney U Statistic (CTR): {stat_ctr}")
print(f"P-value (CTR): {p_value_ctr}")
# Local import: PercentFormatter is only needed for the axis formatting below.
from matplotlib.ticker import PercentFormatter

# Build the long-format frame once and reuse it for both panels.
# ignore_index avoids duplicate row labels in the stacked frame.
rates_by_group = pd.concat(
    [control_df.assign(Group='Control'), test_df.assign(Group='Test')],
    ignore_index=True,
)

# Create subplots
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 4))
fig.suptitle('Violin Plots of CR and CTR', fontweight='bold')

# CR and CTR are stored as fractions (purchases / clicks, clicks / impressions),
# while the axis labels promise percentages. PercentFormatter(xmax=1) renders
# a value of 1.0 as "100%", so the ticks agree with the "(%)" labels.

# Violin plot for CR
sns.violinplot(x='Group', y='CR', data=rates_by_group, ax=ax1)
ax1.set_title('Conversion Rate (CR): Control vs. Test')
ax1.set_ylabel('Conversion Rate (%)')
ax1.yaxis.set_major_formatter(PercentFormatter(xmax=1))

# Violin plot for CTR
sns.violinplot(x='Group', y='CTR', data=rates_by_group, ax=ax2)
ax2.set_title('Click-Through Rate (CTR): Control vs. Test')
ax2.set_ylabel('Click-Through Rate (%)')
ax2.yaxis.set_major_formatter(PercentFormatter(xmax=1))

# Adjust layout
plt.tight_layout()
plt.show()
Mann-Whitney U Statistic (CR): 523.0 P-value (CR): 0.2837780479456242 Mann-Whitney U Statistic (CTR): 197.0 P-value (CTR): 0.00018916193602108462
# Pin each campaign to a colour by name. A plain list palette is assigned to
# the hue levels in order, which would give orange to the first level
# (Control), whereas Test is the orange series in the CPP histogram.
campaign_palette = {'Control Campaign': 'blue', 'Test Campaign': 'orange'}
campaign_order = list(campaign_palette)

# (column to plot, panel title, x-axis label) for each subplot.
panels = [
    ('CR', 'Distribution of Conversion Rate (CR)', 'Conversion Rate (CR)'),
    ('CTR', 'Distribution of Click-Through Rate (CTR)', 'Click-Through Rate (CTR)'),
]

fig, axes = plt.subplots(ncols=2, figsize=(14, 6))
for ax, (metric, title, xlabel) in zip(axes, panels):
    sns.histplot(
        data=combined_df,
        x=metric,
        hue='Group',
        hue_order=campaign_order,
        palette=campaign_palette,
        kde=True,
        bins=18,
        alpha=0.6,
        ax=ax,
    )
    ax.set_title(title, fontsize=14)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)

    # Keep the legend seaborn built from the hue mapping and only restyle it.
    # Replacing it via plt.legend(labels=[...]) pairs labels with artists
    # purely by position, which can attach a label to the wrong colour.
    legend = ax.get_legend()
    if legend is not None:
        legend.set_title('Campaign')
        for text in legend.get_texts():
            text.set_fontsize(10)

# Adjust layout
plt.tight_layout()
plt.show()
Conversion Rate (CR) | Click-Through Rate (CTR) |
---|---|
Results: | Results: |
- Mann-Whitney U Statistic (CR): 523.0 | - Mann-Whitney U Statistic (CTR): 197.0 |
- P-value (CR): 0.2838 | - P-value (CTR): 0.00019 |
Interpretation: | Interpretation: |
- The p-value (0.2838) is greater than 0.05. | - The p-value (0.00019) is less than 0.05. |
- This means we fail to reject the null hypothesis. | - This means we reject the null hypothesis. |
Conclusion: | Conclusion: |
- There is no statistically significant difference in the Conversion Rate (CR) between the Control and Test groups. | - There is a statistically significant difference in the Click-Through Rate (CTR) between the Control and Test groups. |
Conversion Rate (CR):
Click-Through Rate (CTR):
For CTR:
For CR:
Further Analysis: