Dataset: Sales records from multiple branches with some missing and duplicate values.
import pandas as pd
import numpy as np
data = {
'Branch': ['A', 'B', 'B', 'C', 'D', 'E', 'F', 'G'],
'Sales': [10000, 15000, 15000, 20000, None, 17000, None, 20000],
'Manager': ['Raj', 'Simran', 'Simran', 'Kunal', np.nan, 'Arjun', 'Kunal', 'Raj'],
'Region': ['East', 'West', 'West', np.nan, 'North', 'South', 'North', np.nan]
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
print("\nMissing Values in Each Column:")
print(df.isnull().sum())
print("\nRows where Sales is Missing:")
print(df[df['Sales'].isnull()])
# Replace missing Sales with mean
mean_sales = df['Sales'].mean()
df['Sales'] = df['Sales'].fillna(mean_sales)
# Replace missing Manager with 'Unknown'
df['Manager'] = df['Manager'].fillna('Unknown')
# Drop rows with missing Region
df.dropna(subset=['Region'], inplace=True)
print("\nCleaned DataFrame:")
print(df)
# Check for duplicate rows
duplicates = df.duplicated()
print("\nDuplicate Rows:")
print(duplicates)
# Count of duplicates
print("\nTotal Duplicate Rows:", duplicates.sum())
# Drop duplicates keeping last occurrence
df = df.drop_duplicates(keep='last')
# Drop duplicates based on Branch and Manager
df_unique = df.drop_duplicates(subset=['Branch', 'Manager'])
# Rows remaining after cleaning
print("\nTotal Rows After Cleaning:", len(df_unique))
# Column with most missing values before cleaning
initial_missing = pd.DataFrame(data).isnull().sum()
most_missing_col = initial_missing.idxmax()
print("Column with Most Missing Values:", most_missing_col)
# Average Sales after filling missing values
print("Average Sales After Cleaning:", round(df['Sales'].mean(), 2))
isnull() and sum() to inspect missing data and filters rows where Sales is null.fillna() is used to fill missing Sales with the mean value using assignment to avoid chained warnings.dropna().duplicated() checks for duplicate rows.drop_duplicates() removes duplicates keeping the last entry.