# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
# A relative path is used, so "Iris.csv" must be reachable from the current
# working directory (or replace it with the correct path).
df = pd.read_csv("Iris.csv")

# Display first few rows
print("First 5 rows of the dataset:")
print(df.head())

# Shape of the dataset
print("\nDataset shape (rows, columns):", df.shape)

# Dataset info
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
print("\nData Types and Null Counts:")
df.info()

# Check for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())

# Statistical summary (numeric columns only, as produced by describe())
print("\nSummary statistics:")
print(df.describe())

First 5 rows of the dataset:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa

Dataset shape (rows, columns): (150, 6)

Data Types and Null Counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
None

Missing values in each column:
Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

Summary statistics:
               Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count  150.000000     150.000000    150.000000     150.000000    150.000000
mean    75.500000       5.843333      3.054000       3.758667      1.198667
std     43.445368       0.828066      0.433594       1.764420      0.763161
min      1.000000       4.300000      2.000000       1.000000      0.100000
25%     38.250000       5.100000      2.800000       1.600000      0.300000
50%     75.500000       5.800000      3.000000       4.350000      1.300000
75%    112.750000       6.400000      3.300000       5.100000      1.800000
max    150.000000       7.900000      4.400000       6.900000      2.500000

# Check for duplicates
# Id is left out of the comparison: it looks like a running row number
# (1..150 in the recorded describe() output), and with it included every row
# would be distinct, so the check could never report a duplicate.
# columns.difference() also works when Id has already been dropped, which
# keeps this cell safe to re-run.
comparison_columns = df.columns.difference(["Id"]).tolist()
print("\nNumber of duplicate rows:", df.duplicated(subset=comparison_columns).sum())

# Drop ID column (not useful for prediction)
# errors="ignore" makes the drop a no-op if the column is already gone,
# instead of raising KeyError on a second execution of this cell.
df.drop(columns=["Id"], inplace=True, errors="ignore")

# Unique classes in the target
print("\nUnique species:", df["Species"].unique())

# Count of each species
print("\nClass distribution:")
print(df["Species"].value_counts())

Number of duplicate rows: 0

Unique species: ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']

Class distribution:
Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

import matplotlib.pyplot as plt
import seaborn as sns

# BOX PLOTS
# One boxplot per feature, grouped by species, arranged on a 2x2 grid.

plt.figure(figsize=(12, 8))  # Make the figure bigger

# List of feature names (excluding 'Species'); drives the subplot loop below
feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

# Subplot positions are 1-based, hence start=1.
for position, column in enumerate(feature_columns, start=1):
    plt.subplot(2, 2, position)
    sns.boxplot(x='Species', y=column, data=df)
    plt.title(f'Boxplot of {column}')

plt.tight_layout()
plt.show()

# CORRELATION MATRIX
# Pairwise correlation between the four measurement columns, drawn as an
# annotated heatmap.

plt.figure(figsize=(8, 6))
measurements = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
corr_matrix = measurements.corr()
heatmap_axes = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
# The heatmap is drawn on the current Axes, so titling that Axes is the same
# as titling through pyplot.
heatmap_axes.set_title("Feature Correlation Matrix")
plt.show()

# DISTRIBUTION PLOTS
# Histogram with a KDE overlay for every measurement, coloured by species,
# arranged on a 2x2 grid.

plt.figure(figsize=(12, 8))

# Subplot positions are 1-based, hence start=1.
for slot, measurement in enumerate(
    ('SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'),
    start=1,
):
    plt.subplot(2, 2, slot)
    sns.histplot(data=df, x=measurement, hue='Species', kde=True, palette='bright')
    plt.title(f'Distribution of {measurement}')

plt.tight_layout()
plt.show()