In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the dataset
df = pd.read_csv("Iris.csv") # Ensure the file is in the same directory or provide the correct path
# Display first few rows
print("First 5 rows of the dataset:")
print(df.head())
# Shape of the dataset
print("\nDataset shape (rows, columns):", df.shape)
# Dataset info
print("\nData Types and Null Counts:")
print(df.info())
# Check for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())
# Statistical summary
print("\nSummary statistics:")
print(df.describe())
First 5 rows of the dataset:
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.1 3.5 1.4 0.2 Iris-setosa
1 2 4.9 3.0 1.4 0.2 Iris-setosa
2 3 4.7 3.2 1.3 0.2 Iris-setosa
3 4 4.6 3.1 1.5 0.2 Iris-setosa
4 5 5.0 3.6 1.4 0.2 Iris-setosa
Dataset shape (rows, columns): (150, 6)
Data Types and Null Counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 150 non-null int64
1 SepalLengthCm 150 non-null float64
2 SepalWidthCm 150 non-null float64
3 PetalLengthCm 150 non-null float64
4 PetalWidthCm 150 non-null float64
5 Species 150 non-null object
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
None
Missing values in each column:
Id 0
SepalLengthCm 0
SepalWidthCm 0
PetalLengthCm 0
PetalWidthCm 0
Species 0
dtype: int64
Summary statistics:
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
count 150.000000 150.000000 150.000000 150.000000 150.000000
mean 75.500000 5.843333 3.054000 3.758667 1.198667
std 43.445368 0.828066 0.433594 1.764420 0.763161
min 1.000000 4.300000 2.000000 1.000000 0.100000
25% 38.250000 5.100000 2.800000 1.600000 0.300000
50% 75.500000 5.800000 3.000000 4.350000 1.300000
75% 112.750000 6.400000 3.300000 5.100000 1.800000
max 150.000000 7.900000 4.400000 6.900000 2.500000
In [2]:
# Check for duplicates
print("\nNumber of duplicate rows:", df.duplicated().sum())
# Drop ID column (not useful for prediction)
df.drop(columns=["Id"], inplace=True)
# Unique classes in the target
print("\nUnique species:", df["Species"].unique())
# Count of each species
print("\nClass distribution:")
print(df["Species"].value_counts())
Number of duplicate rows: 0 Unique species: ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica'] Class distribution: Species Iris-setosa 50 Iris-versicolor 50 Iris-virginica 50 Name: count, dtype: int64
In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
# BOX PLOTS
# We will draw a separate boxplot for each feature (one by one)
plt.figure(figsize=(12, 8)) # Make the figure bigger
# List of feature names (excluding 'Species')
feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
# First boxplot
plt.subplot(2, 2, 1)
sns.boxplot(x='Species', y='SepalLengthCm', data=df)
plt.title('Boxplot of SepalLengthCm')
# Second boxplot
plt.subplot(2, 2, 2)
sns.boxplot(x='Species', y='SepalWidthCm', data=df)
plt.title('Boxplot of SepalWidthCm')
# Third boxplot
plt.subplot(2, 2, 3)
sns.boxplot(x='Species', y='PetalLengthCm', data=df)
plt.title('Boxplot of PetalLengthCm')
# Fourth boxplot
plt.subplot(2, 2, 4)
sns.boxplot(x='Species', y='PetalWidthCm', data=df)
plt.title('Boxplot of PetalWidthCm')
plt.tight_layout()
plt.show()
In [4]:
# CORRELATION MATRIX
# This shows how much each feature is related to the others
plt.figure(figsize=(8, 6))
corr_matrix = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Feature Correlation Matrix")
plt.show()
In [5]:
# DISTRIBUTION PLOTS
# These plots show how each feature is spread for each species
plt.figure(figsize=(12, 8))
# First histogram
plt.subplot(2, 2, 1)
sns.histplot(data=df, x='SepalLengthCm', hue='Species', kde=True, palette='bright')
plt.title('Distribution of SepalLengthCm')
# Second histogram
plt.subplot(2, 2, 2)
sns.histplot(data=df, x='SepalWidthCm', hue='Species', kde=True, palette='bright')
plt.title('Distribution of SepalWidthCm')
# Third histogram
plt.subplot(2, 2, 3)
sns.histplot(data=df, x='PetalLengthCm', hue='Species', kde=True, palette='bright')
plt.title('Distribution of PetalLengthCm')
# Fourth histogram
plt.subplot(2, 2, 4)
sns.histplot(data=df, x='PetalWidthCm', hue='Species', kde=True, palette='bright')
plt.title('Distribution of PetalWidthCm')
plt.tight_layout()
plt.show()