import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import chi2_contingency, norm, zscore
from scipy.stats import skew, kurtosis
from scipy.stats import ttest_ind, mannwhitneyu, shapiro

import statsmodels.api as sm


# First attempt: read both campaign exports with pandas' default (comma)
# field separator.
control_df = pd.read_csv('control_group.csv')
test_df = pd.read_csv('test_group.csv')


# Preview the result of the default-separator read.
control_df.head()


# Re-read both files with ';' as the field separator. These frames replace
# the ones loaded above and are the ones used from here on.
control_df = pd.read_csv('control_group.csv', sep =";")
test_df = pd.read_csv('test_group.csv', sep =";")


# Preview the re-parsed control data.
control_df.head()


# Summarise column dtypes and non-null counts for the control group.
control_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Campaign Name        30 non-null     object
 1   Date                 30 non-null     object
 2   Spend [USD]          30 non-null     int64
 3   # of Impressions     29 non-null     float64
 4   Reach                29 non-null     float64
 5   # of Website Clicks  29 non-null     float64
 6   # of Searches        29 non-null     float64
 7   # of View Content    29 non-null     float64
 8   # of Add to Cart     29 non-null     float64
 9   # of Purchase        29 non-null     float64
dtypes: float64(7), int64(1), object(2)
memory usage: 2.5+ KB


# Preview the first rows of the test-group data.
test_df.head()


# Summarise column dtypes and non-null counts for the test group.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Campaign Name        30 non-null     object
 1   Date                 30 non-null     object
 2   Spend [USD]          30 non-null     int64
 3   # of Impressions     30 non-null     int64
 4   Reach                30 non-null     int64
 5   # of Website Clicks  30 non-null     int64
 6   # of Searches        30 non-null     int64
 7   # of View Content    30 non-null     int64
 8   # of Add to Cart     30 non-null     int64
 9   # of Purchase        30 non-null     int64
dtypes: int64(8), object(2)
memory usage: 2.5+ KB


# The campaign label is not needed as a column: remove it from both frames.
control_df.drop('Campaign Name', axis='columns', inplace=True)
test_df.drop('Campaign Name', axis='columns', inplace=True)


# Short, shared column names applied to both frames (same column order).
column_names = [
    'Date', 'Spent', 'Impressions', 'Reach', 'Clicks',
    'Searches', 'Views', 'Add to Cart', 'Purchases',
]
control_df.columns = column_names
test_df.columns = column_names


# Confirm the renamed schema of the control group.
control_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Date         30 non-null     object
 1   Spent        30 non-null     int64
 2   Impressions  29 non-null     float64
 3   Reach        29 non-null     float64
 4   Clicks       29 non-null     float64
 5   Searches     29 non-null     float64
 6   Views        29 non-null     float64
 7   Add to Cart  29 non-null     float64
 8   Purchases    29 non-null     float64
dtypes: float64(7), int64(1), object(1)
memory usage: 2.2+ KB


# Fill the gaps in the metric columns (position 2 onward) with each
# column's own median.
metric_values = control_df.iloc[:, 2:]
control_df.iloc[:, 2:] = metric_values.fillna(metric_values.median())

# Re-check the non-null counts after imputation.
control_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Date         30 non-null     object
 1   Spent        30 non-null     int64
 2   Impressions  30 non-null     float64
 3   Reach        30 non-null     float64
 4   Clicks       30 non-null     float64
 5   Searches     30 non-null     float64
 6   Views        30 non-null     float64
 7   Add to Cart  30 non-null     float64
 8   Purchases    30 non-null     float64
dtypes: float64(7), int64(1), object(1)
memory usage: 2.2+ KB


# Cost per purchase (CPP) for each row: spend divided by purchases.
control_df['CPP'] = control_df['Spent'].astype(float).div(control_df['Purchases'])
test_df['CPP'] = test_df['Spent'].astype(float).div(test_df['Purchases'])


# Overlay the CPP histograms (with KDE curves) of both groups on one axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(test_df['CPP'], kde=True, label='Test', color='orange', bins=12, ax=ax)
sns.histplot(control_df['CPP'], kde=True, label='Control', color='blue', bins=12, alpha=0.5, ax=ax)
ax.set_title('Distribution of CPP')
ax.set_xlabel('CPP (USD per Purchase)')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()


# Shapiro-Wilk normality check on each group's CPP. Only the p-value
# (second element of the result) is kept.
p_control = shapiro(control_df['CPP'])[1]
p_test = shapiro(test_df['CPP'])[1]

print(f"Shapiro-Wilk p-value (Control): {p_control}")
print(f"Shapiro-Wilk p-value (Test): {p_test}")

Shapiro-Wilk p-value (Control): 0.011892481707036495
Shapiro-Wilk p-value (Test): 0.006934247445315123


# Compare the CPP distributions of the two groups with a two-sided
# Mann-Whitney U test (rank-based, so it does not rely on normality).
# Rows whose CPP is NaN are excluded from the test.
stat, p_value = mannwhitneyu(
    control_df['CPP'].dropna(),
    test_df['CPP'].dropna(),
    alternative='two-sided',
)

print(f"Mann-Whitney U Statistic: {stat}")
print(f"P-value: {p_value}")

# Interpret the result at the 5% significance level.
# NOTE: the messages below say "CPA"; the metric being tested is the CPP column.
if p_value < 0.05:
    print("Reject the null hypothesis: There is a statistically significant difference in CPA between the two groups.")
else:
    print("Fail to reject the null hypothesis: There is no statistically significant difference in CPA between the two groups.")

# Box plot of CPP per group. ignore_index=True gives the stacked frame a
# unique index instead of repeating each source frame's row labels.
cpp_by_group = pd.concat(
    [control_df.assign(Group='Control'), test_df.assign(Group='Test')],
    ignore_index=True,
)
plt.figure(figsize=(8, 6))
sns.boxplot(x='Group', y='CPP', data=cpp_by_group)
plt.title('Comparison of CPP: Control vs. Test')
# The plotted column is CPP, so the axis label names CPP to match the title.
plt.ylabel('CPP (USD per Purchase)')
plt.show()

Mann-Whitney U Statistic: 369.0
P-value: 0.2339889162810581
Fail to reject the null hypothesis: There is no statistically significant difference in CPA between the two groups.


# Per-row funnel ratios for each group:
#   CR  = Purchases / Clicks
#   CTR = Clicks / Impressions
control_df['CR'] = control_df['Purchases'].div(control_df['Clicks'])
control_df['CTR'] = control_df['Clicks'].div(control_df['Impressions'])

test_df['CR'] = test_df['Purchases'].div(test_df['Clicks'])
test_df['CTR'] = test_df['Clicks'].div(test_df['Impressions'])


# Shapiro-Wilk normality check for CR and CTR in each group. Only the
# p-value (second element of each result) is kept.
p_cr_control = shapiro(control_df['CR'])[1]
p_cr_test = shapiro(test_df['CR'])[1]
p_ctr_control = shapiro(control_df['CTR'])[1]
p_ctr_test = shapiro(test_df['CTR'])[1]

print(f"Shapiro-Wilk p-value (CR Control): {p_cr_control}")
print(f"Shapiro-Wilk p-value (CR Test): {p_cr_test}")
print(f"Shapiro-Wilk p-value (CTR Control): {p_ctr_control}")
print(f"Shapiro-Wilk p-value (CTR Test): {p_ctr_test}")

Shapiro-Wilk p-value (CR Control): 0.005544630810618401
Shapiro-Wilk p-value (CR Test): 0.037268370389938354
Shapiro-Wilk p-value (CTR Control): 0.25710153579711914
Shapiro-Wilk p-value (CTR Test): 0.00040253173210658133


# Stack both groups into one long frame tagged by campaign.
# ignore_index=True renumbers the rows 0..n-1 so no index label is repeated.
combined_df = pd.concat(
    [
        control_df.assign(Group='Control Campaign'),
        test_df.assign(Group='Test Campaign'),
    ],
    ignore_index=True,
)


# Two-sided Mann-Whitney U tests comparing control vs. test on CR and on CTR.
stat_cr, p_value_cr = mannwhitneyu(
    x=control_df['CR'], y=test_df['CR'], alternative='two-sided'
)
stat_ctr, p_value_ctr = mannwhitneyu(
    x=control_df['CTR'], y=test_df['CTR'], alternative='two-sided'
)

# Report CR first, then CTR.
print(f"Mann-Whitney U Statistic (CR): {stat_cr}")
print(f"P-value (CR): {p_value_cr}")
print(f"Mann-Whitney U Statistic (CTR): {stat_ctr}")
print(f"P-value (CTR): {p_value_ctr}")


# Side-by-side violin plots of CR and CTR for the control and test groups.
from matplotlib.ticker import PercentFormatter

# Build the stacked, group-tagged frame once and reuse it for both panels.
# ignore_index=True keeps the row labels unique.
rates_by_group = pd.concat(
    [control_df.assign(Group='Control'), test_df.assign(Group='Test')],
    ignore_index=True,
)

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 4))
fig.suptitle('Violin Plots of CR and CTR', fontweight='bold')

# Left panel: conversion rate.
sns.violinplot(x='Group', y='CR', data=rates_by_group, ax=ax1)
ax1.set_title('Conversion Rate (CR): Control vs. Test')
ax1.set_ylabel('Conversion Rate (%)')

# Right panel: click-through rate.
sns.violinplot(x='Group', y='CTR', data=rates_by_group, ax=ax2)
ax2.set_title('Click-Through Rate (CTR): Control vs. Test')
ax2.set_ylabel('Click-Through Rate (%)')

# CR and CTR are stored as fractions (Purchases / Clicks and
# Clicks / Impressions), so render the ticks as percentages to agree with
# the "(%)" axis labels.
for axis in (ax1, ax2):
    axis.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))

plt.tight_layout()
plt.show()

Mann-Whitney U Statistic (CR): 523.0
P-value (CR): 0.2837780479456242
Mann-Whitney U Statistic (CTR): 197.0
P-value (CTR): 0.00018916193602108462


# Histograms (with KDE curves) of CR and CTR, coloured by campaign.
#
# Colours are bound to group names explicitly rather than by position in a
# list, and the legend is the one seaborn builds from `hue`, so every legend
# entry is tied to its own colour instead of depending on artist order.
# The mapping matches what the positional palette ['orange', 'blue'] gives
# when 'Control Campaign' is the first hue level.
# NOTE(review): the CPP histogram colours Test orange and Control blue, the
# opposite of this mapping - confirm which assignment is intended.
campaign_palette = {'Control Campaign': 'orange', 'Test Campaign': 'blue'}

# (column, panel title, x-axis label) for each subplot, left to right.
panels = [
    ('CR', 'Distribution of Conversion Rate (CR)', 'Conversion Rate (CR)'),
    ('CTR', 'Distribution of Click-Through Rate (CTR)', 'Click-Through Rate (CTR)'),
]

plt.figure(figsize=(14, 6))

for position, (column, panel_title, x_label) in enumerate(panels, start=1):
    ax = plt.subplot(1, 2, position)
    sns.histplot(
        data=combined_df,
        x=column,
        hue='Group',
        hue_order=list(campaign_palette),
        palette=campaign_palette,
        kde=True,
        bins=18,
        alpha=0.6,
        ax=ax,
    )
    ax.set_title(panel_title, fontsize=14)
    ax.set_xlabel(x_label, fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)

    # Restyle seaborn's own legend: retitle it and shorten the entries to
    # 'Control' / 'Test' while keeping each entry's handle untouched.
    legend = ax.get_legend()
    legend.set_title('Campaign')
    for entry in legend.get_texts():
        entry.set_text(entry.get_text().replace(' Campaign', ''))
        entry.set_fontsize(10)

# Adjust layout
plt.tight_layout()
plt.show()

	Campaign Name;Date;Spend [USD];# of Impressions;Reach;# of Website Clicks;# of Searches;# of View Content;# of Add to Cart;# of Purchase
0	Control Campaign;1.08.2019;2280;82702;56930;70...
1	Control Campaign;2.08.2019;1757;121040;102513;...
2	Control Campaign;3.08.2019;2343;131711;110862;...
3	Control Campaign;4.08.2019;1940;72878;61235;30...
4	Control Campaign;5.08.2019;1835;;;;;;;

	Campaign Name	Date	Spend [USD]	# of Impressions	Reach	# of Website Clicks	# of Searches	# of View Content	# of Add to Cart	# of Purchase
0	Control Campaign	1.08.2019	2280	82702.0	56930.0	7016.0	2290.0	2159.0	1819.0	618.0
1	Control Campaign	2.08.2019	1757	121040.0	102513.0	8110.0	2033.0	1841.0	1219.0	511.0
2	Control Campaign	3.08.2019	2343	131711.0	110862.0	6508.0	1737.0	1549.0	1134.0	372.0
3	Control Campaign	4.08.2019	1940	72878.0	61235.0	3065.0	1042.0	982.0	1183.0	340.0
4	Control Campaign	5.08.2019	1835	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	Campaign Name	Date	Spend [USD]	# of Impressions	Reach	# of Website Clicks	# of Searches	# of View Content	# of Add to Cart	# of Purchase
0	Test Campaign	1.08.2019	3008	39550	35820	3038	1946	1069	894	255
1	Test Campaign	2.08.2019	2542	100719	91236	4657	2359	1548	879	677
2	Test Campaign	3.08.2019	2365	70263	45198	7885	2572	2367	1268	578
3	Test Campaign	4.08.2019	2710	78451	25937	4216	2216	1437	566	340
4	Test Campaign	5.08.2019	2297	114295	95138	5863	2106	858	956	768

Conversion Rate (CR)	Click-Through Rate (CTR)
Results:	Results:
- Mann-Whitney U Statistic (CR): 523.0	- Mann-Whitney U Statistic (CTR): 197.0
- P-value (CR): 0.2838	- P-value (CTR): 0.00019

Interpretation:	Interpretation:
- The p-value (0.2838) is greater than 0.05.	- The p-value (0.00019) is less than 0.05.
- This means we fail to reject the null hypothesis.	- This means we reject the null hypothesis.

Conclusion:	Conclusion:
- There is no statistically significant difference in the Conversion Rate (CR) between the Control and Test groups.	- There is a statistically significant difference in the Click-Through Rate (CTR) between the Control and Test groups.

1. Load and Preprocess the Data

2. Metrics

Cost Per Purchase (CPP)

Why CPP is Important

Efficiency Metric:

ROI Calculation:

Campaign Optimization:

Explanation of Results

Interpretation of Results

Explore Other Metrics

CR and CTR

CR and CTR: Hypothesis Formulation

For Conversion Rate (CR):

For Click-Through Rate (CTR):

Interpretation of Shapiro-Wilk p-values

Implications for Statistical Testing

Key Insights

Next Steps