Naïve Bayes Classifier¶
Bayes’ Theorem¶
Bayes' Theorem is defined as:
$$ P(C|X) = \frac{P(X|C) \cdot P(C)}{P(X)} $$
Where:
- $P(C|X)$: Posterior probability of class $C$ given features $X$
- $P(X|C)$: Likelihood of features given class
- $P(C)$: Prior probability of class
- $P(X)$: Evidence (constant across classes)
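A quick numeric sketch of the theorem, using made-up probabilities for illustration:

```python
# Suppose we assume P(C) = 0.3, P(X|C) = 0.8, and P(X) = 0.5.
p_c = 0.3          # prior P(C)
p_x_given_c = 0.8  # likelihood P(X|C)
p_x = 0.5          # evidence P(X)

# Bayes' Theorem: posterior = likelihood * prior / evidence
p_c_given_x = (p_x_given_c * p_c) / p_x
print(p_c_given_x)  # 0.48
```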
Naïve Assumption¶
Naïve Bayes assumes that all features are conditionally independent given the class, allowing the posterior to be simplified as:
$$ P(C|x_1, x_2, ..., x_n) \propto P(C) \cdot \prod_{i=1}^{n} P(x_i|C) $$
Despite this strong independence assumption, Naïve Bayes often performs well in practice, especially for high-dimensional data like text.
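The simplification above can be sketched directly: with per-feature likelihoods in hand, classification reduces to multiplying the prior by each $P(x_i|C)$ and picking the class with the largest score. All probabilities below are hypothetical, chosen only to illustrate the computation:

```python
# Unnormalized posterior: score(C) = P(C) * prod_i P(x_i | C)
priors = {'spam': 0.4, 'ham': 0.6}
likelihoods = {
    'spam': [0.7, 0.6, 0.9],  # P(x_i | spam) for three observed features
    'ham':  [0.2, 0.5, 0.4],  # P(x_i | ham)
}

scores = {}
for c in priors:
    score = priors[c]
    for p in likelihoods[c]:
        score *= p
    scores[c] = score

print(scores)                        # {'spam': 0.1512, 'ham': 0.024}
print(max(scores, key=scores.get))   # spam
```

Since $P(X)$ is the same for every class, it can be dropped, which is why the scores need not sum to one.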
Types of Naïve Bayes¶
- MultinomialNB: For discrete features such as word counts
- BernoulliNB: For binary/boolean features
- GaussianNB: For continuous features assuming a normal distribution
- CategoricalNB: For categorical features encoded as integers (used in the examples below)
# imports
import pandas as pd
# dataset
data = {
'Outlook': [
'Sunny', 'Sunny', 'Overcast', 'Rainy', 'Sunny', 'Overcast', 'Rainy', 'Sunny', 'Sunny', 'Overcast',
'Rainy', 'Rainy', 'Sunny', 'Overcast', 'Overcast', 'Sunny', 'Rainy', 'Rainy', 'Sunny', 'Sunny',
'Overcast', 'Rainy', 'Sunny', 'Sunny', 'Overcast', 'Rainy', 'Sunny', 'Rainy', 'Overcast', 'Sunny',
'Sunny', 'Overcast', 'Rainy', 'Sunny', 'Sunny', 'Rainy', 'Overcast', 'Sunny', 'Rainy', 'Overcast',
'Sunny', 'Rainy', 'Overcast', 'Rainy', 'Sunny', 'Overcast', 'Rainy', 'Sunny', 'Rainy', 'Sunny'
],
'Temperature': [
'Hot', 'Mild', 'Hot', 'Mild', 'Cool', 'Mild', 'Cool', 'Hot', 'Hot', 'Hot',
'Mild', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 'Hot', 'Mild', 'Cool', 'Mild',
'Hot', 'Mild', 'Cool', 'Hot', 'Mild', 'Cool', 'Hot', 'Hot', 'Mild', 'Mild',
'Hot', 'Cool', 'Mild', 'Cool', 'Hot', 'Hot', 'Mild', 'Mild', 'Cool', 'Cool',
'Mild', 'Cool', 'Hot', 'Mild', 'Hot', 'Mild', 'Cool', 'Mild', 'Hot', 'Mild'
],
'Play': [
'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes',
'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No',
'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No',
'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes',
'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No'
]
}
df = pd.DataFrame(data)
df
|  | Outlook | Temperature | Play |
|---|---|---|---|
| 0 | Sunny | Hot | No |
| 1 | Sunny | Mild | No |
| 2 | Overcast | Hot | Yes |
| 3 | Rainy | Mild | Yes |
| 4 | Sunny | Cool | Yes |
| 5 | Overcast | Mild | Yes |
| 6 | Rainy | Cool | Yes |
| 7 | Sunny | Hot | No |
| 8 | Sunny | Hot | Yes |
| 9 | Overcast | Hot | Yes |
| 10 | Rainy | Mild | Yes |
| 11 | Rainy | Cool | No |
| 12 | Sunny | Cool | Yes |
| 13 | Overcast | Mild | Yes |
| 14 | Overcast | Cool | Yes |
| 15 | Sunny | Mild | No |
| 16 | Rainy | Hot | Yes |
| 17 | Rainy | Mild | No |
| 18 | Sunny | Cool | Yes |
| 19 | Sunny | Mild | No |
| 20 | Overcast | Hot | Yes |
| 21 | Rainy | Mild | No |
| 22 | Sunny | Cool | Yes |
| 23 | Sunny | Hot | No |
| 24 | Overcast | Mild | Yes |
| 25 | Rainy | Cool | No |
| 26 | Sunny | Hot | Yes |
| 27 | Rainy | Hot | No |
| 28 | Overcast | Mild | Yes |
| 29 | Sunny | Mild | No |
| 30 | Sunny | Hot | Yes |
| 31 | Overcast | Cool | Yes |
| 32 | Rainy | Mild | Yes |
| 33 | Sunny | Cool | No |
| 34 | Sunny | Hot | Yes |
| 35 | Rainy | Hot | No |
| 36 | Overcast | Mild | Yes |
| 37 | Sunny | Mild | Yes |
| 38 | Rainy | Cool | No |
| 39 | Overcast | Cool | Yes |
| 40 | Sunny | Mild | No |
| 41 | Rainy | Cool | Yes |
| 42 | Overcast | Hot | Yes |
| 43 | Rainy | Mild | No |
| 44 | Sunny | Hot | Yes |
| 45 | Overcast | Mild | Yes |
| 46 | Rainy | Cool | Yes |
| 47 | Sunny | Mild | No |
| 48 | Rainy | Hot | Yes |
| 49 | Sunny | Mild | No |
from sklearn.preprocessing import LabelEncoder
# Encode string labels into numbers
le_outlook = LabelEncoder()
le_temp = LabelEncoder()
le_play = LabelEncoder()
df['Outlook'] = le_outlook.fit_transform(df['Outlook']) # Sunny=2, Overcast=0, Rainy=1
df['Temperature'] = le_temp.fit_transform(df['Temperature']) # Cool=0, Hot=1, Mild=2
df['Play'] = le_play.fit_transform(df['Play']) # No=0, Yes=1
df
|  | Outlook | Temperature | Play |
|---|---|---|---|
| 0 | 2 | 1 | 0 |
| 1 | 2 | 2 | 0 |
| 2 | 0 | 1 | 1 |
| 3 | 1 | 2 | 1 |
| 4 | 2 | 0 | 1 |
| 5 | 0 | 2 | 1 |
| 6 | 1 | 0 | 1 |
| 7 | 2 | 1 | 0 |
| 8 | 2 | 1 | 1 |
| 9 | 0 | 1 | 1 |
| 10 | 1 | 2 | 1 |
| 11 | 1 | 0 | 0 |
| 12 | 2 | 0 | 1 |
| 13 | 0 | 2 | 1 |
| 14 | 0 | 0 | 1 |
| 15 | 2 | 2 | 0 |
| 16 | 1 | 1 | 1 |
| 17 | 1 | 2 | 0 |
| 18 | 2 | 0 | 1 |
| 19 | 2 | 2 | 0 |
| 20 | 0 | 1 | 1 |
| 21 | 1 | 2 | 0 |
| 22 | 2 | 0 | 1 |
| 23 | 2 | 1 | 0 |
| 24 | 0 | 2 | 1 |
| 25 | 1 | 0 | 0 |
| 26 | 2 | 1 | 1 |
| 27 | 1 | 1 | 0 |
| 28 | 0 | 2 | 1 |
| 29 | 2 | 2 | 0 |
| 30 | 2 | 1 | 1 |
| 31 | 0 | 0 | 1 |
| 32 | 1 | 2 | 1 |
| 33 | 2 | 0 | 0 |
| 34 | 2 | 1 | 1 |
| 35 | 1 | 1 | 0 |
| 36 | 0 | 2 | 1 |
| 37 | 2 | 2 | 1 |
| 38 | 1 | 0 | 0 |
| 39 | 0 | 0 | 1 |
| 40 | 2 | 2 | 0 |
| 41 | 1 | 0 | 1 |
| 42 | 0 | 1 | 1 |
| 43 | 1 | 2 | 0 |
| 44 | 2 | 1 | 1 |
| 45 | 0 | 2 | 1 |
| 46 | 1 | 0 | 1 |
| 47 | 2 | 2 | 0 |
| 48 | 1 | 1 | 1 |
| 49 | 2 | 2 | 0 |
from sklearn.naive_bayes import CategoricalNB
# Define features (X) and label (y)
X = df[['Outlook', 'Temperature']]
y = df['Play']
# Train the model
model = CategoricalNB()
model.fit(X, y)
CategoricalNB()
# Encode new input: Sunny, Hot
outlook_input = le_outlook.transform(['Sunny'])[0]
temp_input = le_temp.transform(['Hot'])[0]
# Use DataFrame to avoid warnings
input_df = pd.DataFrame([[outlook_input, temp_input]], columns=['Outlook', 'Temperature'])
# Predict
model.predict(input_df)
array([1])
# Convert back to class label
predicted = model.predict(input_df)
le_play.inverse_transform(predicted)
array(['Yes'], dtype=object)
What Happened¶
The model learned from the training data how likely each value of Outlook and each value of Temperature is under each class (Play = Yes or No).
To predict, it:
- computes the prior: how common each class is (Yes vs. No)
- computes the likelihoods: how often each feature value occurs within each class
- multiplies the prior by the likelihoods and picks the class with the highest score
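These quantities can be checked by hand. A small sketch, using a hypothetical five-row subset (not the full dataset above) and only the Outlook feature:

```python
import pandas as pd

# Hypothetical subset for illustration
df = pd.DataFrame({
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rainy', 'Sunny'],
    'Play':    ['No', 'No', 'Yes', 'Yes', 'Yes'],
})

# Prior: how common the class is
prior_yes = (df['Play'] == 'Yes').mean()                       # 3/5 = 0.6

# Likelihood: how often Outlook=Sunny occurs within the class
yes_rows = df[df['Play'] == 'Yes']
lik_sunny_given_yes = (yes_rows['Outlook'] == 'Sunny').mean()  # 1/3

# Unnormalized posterior score for Play=Yes given Outlook=Sunny
print(prior_yes * lik_sunny_given_yes)  # 0.2
```

CategoricalNB performs the same computation (with Laplace smoothing, controlled by its `alpha` parameter) for every class and feature, then picks the class with the highest score.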
# Create a DataFrame for test samples
test_samples = {
'Outlook': ['Sunny', 'Overcast', 'Rainy', 'Sunny'],
'Temperature': ['Mild', 'Mild', 'Mild', 'Mild']
}
test_df = pd.DataFrame(test_samples)
print(test_df)
    Outlook Temperature
0     Sunny        Mild
1  Overcast        Mild
2     Rainy        Mild
3     Sunny        Mild
# Encode using the same label encoders used during training
test_df['Outlook'] = le_outlook.transform(test_df['Outlook'])
test_df['Temperature'] = le_temp.transform(test_df['Temperature'])
print(test_df)
   Outlook  Temperature
0        2            2
1        0            2
2        1            2
3        2            2
# Predict for all test samples
predicted = model.predict(test_df)
# Decode predicted labels (0/1 → No/Yes)
decoded = le_play.inverse_transform(predicted)
# Add predictions to the DataFrame
test_samples_result = test_df.copy()
test_samples_result['Play_Predicted'] = decoded
print(test_samples_result)
   Outlook  Temperature Play_Predicted
0        2            2             No
1        0            2            Yes
2        1            2             No
3        2            2             No
Adult Income Dataset from the UCI Machine Learning Repository¶
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score, classification_report
# Step 1: Load the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
'hours_per_week', 'native_country', 'income']
df = pd.read_csv(url, header=None, names=columns, na_values=' ?')
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             32561 non-null  int64
 1   workclass       30725 non-null  object
 2   fnlwgt          32561 non-null  int64
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64
 5   marital_status  32561 non-null  object
 6   occupation      30718 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64
 11  capital_loss    32561 non-null  int64
 12  hours_per_week  32561 non-null  int64
 13  native_country  31978 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None
print(df.describe())
age fnlwgt education_num capital_gain capital_loss \
count 32561.000000 3.256100e+04 32561.000000 32561.000000 32561.000000
mean 38.581647 1.897784e+05 10.080679 1077.648844 87.303830
std 13.640433 1.055500e+05 2.572720 7385.292085 402.960219
min 17.000000 1.228500e+04 1.000000 0.000000 0.000000
25% 28.000000 1.178270e+05 9.000000 0.000000 0.000000
50% 37.000000 1.783560e+05 10.000000 0.000000 0.000000
75% 48.000000 2.370510e+05 12.000000 0.000000 0.000000
max 90.000000 1.484705e+06 16.000000 99999.000000 4356.000000
hours_per_week
count 32561.000000
mean 40.437456
std 12.347429
min 1.000000
25% 40.000000
50% 40.000000
75% 45.000000
max 99.000000
print(df.head())
age workclass fnlwgt education education_num \
0 39 State-gov 77516 Bachelors 13
1 50 Self-emp-not-inc 83311 Bachelors 13
2 38 Private 215646 HS-grad 9
3 53 Private 234721 11th 7
4 28 Private 338409 Bachelors 13
marital_status occupation relationship race sex \
0 Never-married Adm-clerical Not-in-family White Male
1 Married-civ-spouse Exec-managerial Husband White Male
2 Divorced Handlers-cleaners Not-in-family White Male
3 Married-civ-spouse Handlers-cleaners Husband Black Male
4 Married-civ-spouse Prof-specialty Wife Black Female
capital_gain capital_loss hours_per_week native_country income
0 2174 0 40 United-States <=50K
1 0 0 13 United-States <=50K
2 0 0 40 United-States <=50K
3 0 0 40 United-States <=50K
4 0 0 40 Cuba <=50K
print(df.isnull().sum())
age                  0
workclass         1836
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     583
income               0
dtype: int64
print(df['income'].unique())
[' <=50K' ' >50K']
# Step 2: Preprocess the data
# Drop rows with missing values
df.dropna(inplace=True)
# Encode categorical variables
categorical_columns = df.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_columns:
le = LabelEncoder()
df[col] = le.fit_transform(df[col])
label_encoders[col] = le
# Define features and target
X = df.drop('income', axis=1)
y = df['income']
# Step 3: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 4: Train the Naïve Bayes classifier
model = CategoricalNB()
model.fit(X_train, y_train)
CategoricalNB()
# Step 5: Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_report(y_test, y_pred))
Accuracy: 0.86
Classification Report:
precision recall f1-score support
0 0.89 0.93 0.91 6767
1 0.77 0.65 0.70 2282
accuracy 0.86 9049
macro avg 0.83 0.79 0.81 9049
weighted avg 0.86 0.86 0.86 9049
Accuracy¶
Accuracy is the ratio of correctly predicted observations to the total observations:
Accuracy = (Correct predictions) / (Total predictions)
= (True Positives + True Negatives) / Total
= 0.86
This means 86% of all predictions made by the model were correct.
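The arithmetic behind that number is just a ratio; the count of correct predictions below is illustrative, derived from the reported accuracy and the test-set size of 9049:

```python
# Accuracy = correct predictions / total predictions
correct = 7782   # illustrative: true positives + true negatives
total = 9049     # test-set size from the report above

print(round(correct / total, 2))  # 0.86
```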
Classification Report¶
| Metric | Class 0 (<=50K) | Class 1 (>50K) |
|---|---|---|
| Precision | 0.89 | 0.77 |
| Recall | 0.93 | 0.65 |
| F1-score | 0.91 | 0.70 |
| Support | 6767 | 2282 |
Precision is the proportion of correct predictions among all predictions for that class. It answers: Of all the instances predicted as a class, how many were actually correct?
$$\text{Precision} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Positives}}$$
Recall is the proportion of correct predictions among all actual samples of that class. It answers: Of all the actual instances of a class, how many did the model correctly identify?
$$\text{Recall} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}$$
F1-score is the harmonic mean of precision and recall. It provides a single measure of a model’s performance when there is an uneven class distribution.
$$F1 = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}$$
Support is the number of actual instances of each class in the dataset. It gives context to the precision, recall, and F1-score by indicating how many samples the score is based on.
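The three formulas above can be recomputed from raw confusion-matrix counts. The counts here are hypothetical, chosen only to exercise the formulas:

```python
# Hypothetical confusion-matrix counts for one class
tp, fp, fn = 80, 20, 40

precision = tp / (tp + fp)                           # 80/100 = 0.8
recall = tp / (tp + fn)                              # 80/120 ~ 0.667
f1 = 2 * precision * recall / (precision + recall)   # harmonic mean

print(round(precision, 3), round(recall, 3), round(f1, 3))
```

Note that F1 is pulled toward the lower of the two component scores, which is why class 1's weak recall (0.65) drags its F1 down to 0.70 despite a precision of 0.77.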
Class 0 (<=50K): High precision and recall, indicating strong performance on this class.
Class 1 (>50K): Lower recall and F1-score, meaning the model misses many true positives for this class.
Macro and Weighted Averages¶
| Metric | Macro Avg | Weighted Avg |
|---|---|---|
| Precision | 0.83 | 0.86 |
| Recall | 0.79 | 0.86 |
| F1-score | 0.81 | 0.86 |
- Macro Average: Averages the metric across both classes equally, without considering class imbalance.
- Weighted Average: Averages the metric across both classes, weighted by the number of instances in each class.
Since class 0 has significantly more samples, the weighted average is closer to the scores for class 0.
Summary¶
- The model performs very well on class 0 (income <= 50K).
- It performs worse on class 1 (income > 50K), especially in terms of recall.
- The overall accuracy is high, but the model is biased toward the majority class.
Types of Naïve Bayes Models¶
Naïve Bayes classifiers are based on Bayes’ Theorem and assume independence between features. The choice of model depends on the nature of the input features.
Multinomial Naïve Bayes¶
Used when features represent counts or frequencies.
Example: Text classification where each feature is the count of a word in the document (e.g., spam detection using CountVectorizer).
Bernoulli Naïve Bayes¶
Used when features are binary (e.g., presence or absence of a word).
Example: Email classification using binary word occurrence features (word present = 1, absent = 0).
Gaussian Naïve Bayes¶
Used when features are continuous and normally distributed.
Example: Classifying patients as having a disease or not based on continuous variables like blood pressure, cholesterol level, and age.
Each model suits a different type of data: count data, binary indicators, or continuous measurements. The CategoricalNB used in the examples above handles a fourth case, integer-encoded categorical features.
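A minimal sketch fitting each sklearn variant on tiny synthetic data matched to its assumed feature type (all arrays below are made up for illustration):

```python
import numpy as np
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

X_counts = np.array([[2, 0, 1], [0, 3, 0], [1, 1, 4]])    # word counts
X_binary = (X_counts > 0).astype(int)                     # presence/absence
X_cont = np.array([[1.2, 3.4], [0.9, 2.8], [5.1, 7.7]])   # continuous
y = np.array([0, 0, 1])

print(MultinomialNB().fit(X_counts, y).predict(X_counts))
print(BernoulliNB().fit(X_binary, y).predict(X_binary))
print(GaussianNB().fit(X_cont, y).predict(X_cont))
```

The fitting API is identical across variants; only the probability model behind `P(x_i|C)` changes.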