一、朴素贝叶斯算法的实现
naive_bayes_classifier.py
import numpy as np
import collections as cc # 集合的计数功能
from scipy.stats import norm # 极大似然估计样本的均值和标准方差
from data_bin_wrapper import DataBinsWrapper
class NaiveBayesClassifier:
"""
朴素贝叶斯分类器:对于连续属性两种方式操作,1是分箱处理,2是直接进行高斯分布的参数估计
"""
def __init__(self, is_binned=False, is_feature_all_R=False, feature_R_idx=None, max_bins=10):
self.is_binned = is_binned # 连续特征变量数据是否进行分箱操作,离散化
if is_binned:
self.is_feature_all_R = is_feature_all_R # 是否所有特征变量都是连续数值,bool
self.max_bins = max_bins # 最大分箱数
self.dbw = DataBinsWrapper() # 分箱对象
self.dbw_XrangeMap = dict() # 存储训练样本特征分箱的段点
self.feature_R_idx = feature_R_idx # 混合式数据中连续特征变量的索引
self.class_values, self.n_class = None, 0 # 类别取值以及类别数
self.prior_prob = dict() # 先验分布,键是类别取值,键是类别取值
self.classified_feature_prob = dict() # 存储每个类所对应的特征变量取值频次或者连续属性的高斯分布参数
self.feature_values_num = dict() # 训练样本中每个特征不同的取值数,针对离散数据
self.class_values_num = dict() # 目标集中每个类别的样本量,Dc
def _prior_probability(self, y_train):
"""
计算类别的先验概率
:param y_train: 目标集
:return:
"""
n_samples = len(y_train) # 总样本量
self.class_values_num = cc.Counter(y_train) # Counter({'否': 9, '是': 8})
# print(self.class_values_num)
for key in self.class_values_num.keys():
self.prior_prob[key] = (self.class_values_num[key] + 1) / (n_samples + self.n_class)
# print(self.prior_prob)
def _data_bin_wrapper(self, x_samples):
"""
针对特定的连续特征属性索引dbw_feature_idx,分别进行分箱,考虑测试样本与训练样本使用同一个XrangeMap
:param x_samples: 样本:即可以是训练样本,也可以是测试样本
:return:
"""
self.feature_R_idx = np.asarray(self.feature_R_idx)
x_samples_prop = [] # 分箱之后的数据
if not self.dbw_XrangeMap:
# 为空,即创建决策树前所做的分箱操作
for i in服务器托管网 range(x_samples.shape[1]):
if i in self.feature_R_idx: # 说明当前特征是连续数值
self.dbw.fit(x_samples[:, i])
self.dbw_XrangeMap[i] = self.dbw.XrangeMap
x_samples_prop.append(self.dbw.transform(x_samples[:, i]))
else:
x_samples_prop.append(x_samples[:, i])
else: # 针对测试样本的分箱操作
for i in range(x_samples.shape[1]):
if i in self.feature_R_idx: # 说明当前特征是连续数值
x_samples_prop.append(self.dbw.transform(x_samples[:, i], self.dbw_XrangeMap[i]))
else:
x_samples_prop.append(x_samples[:, i])
return np.asarray(x_samples_prop).T
def fit(self, x_train, y_train):
"""
朴素贝叶斯分类器训练,可将朴素贝叶斯分类器涉及的所有概率估值事先计算好存储起来
:param x_train: 训练集
:param y_train: 目标集
:return:
"""
x_train, y_train = np.asarray(x_train), np.asarray(y_train)
self.class_values = np.unique(y_train) # 类别取值
self.n_class = len(self.class_values) # 类别数
if self.n_class
二、可视化分类边界函数
plt_decision_function.py
import matplotlib.pyplot as plt
import numpy as np
def plot_decision_function(X, y, clf, is_show=True):
    """
    Visualize a classifier's decision boundary over a 2-D feature space.

    :param X: test samples, shape (n_samples, 2)
    :param y: class labels used to color the scatter points
    :param clf: fitted classification model exposing ``predict``
    :param is_show: display the figure immediately; pass False when the
                    caller is composing subplots itself
    :return: None
    """
    if is_show:
        plt.figure(figsize=(7, 5))
    # Pad the data range by 1 on each side so the boundary is fully visible.
    x_lo, x_hi = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_lo, y_hi = X[:, 1].min() - 1, X[:, 1].max() + 1
    grid_x, grid_y = np.meshgrid(np.linspace(x_lo, x_hi, 100),
                                 np.linspace(y_lo, y_hi, 100))
    # Predict on every grid point, then reshape back to the grid for contourf.
    grid_pred = clf.predict(np.c_[grid_x.ravel(), grid_y.ravel()]).reshape(grid_x.shape)
    plt.contourf(grid_x, grid_y, grid_pred, cmap="winter", alpha=0.4)
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors="k")
    plt.xlabel("Feature 1", fontdict={"fontsize": 12})
    plt.ylabel("Feature 2", fontdict={"fontsize": 12})
    plt.title("NativeBayes Model Classification Boundary", fontdict={"fontsize": 14})
    if is_show:
        plt.show()
三、朴素贝叶斯算法的测试
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from naive_bayes_classifier import NaiveBayesClassifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from plt_decision_function import plot_decision_function

# Example with the (discrete + continuous) watermelon dataset, kept for reference:
# wm = pd.read_csv("watermelon.csv").dropna()
# X, y = np.asarray(wm.iloc[:, 1:-1]), np.asarray(wm.iloc[:, -1])
# nbc = NaiveBayesClassifier(is_binned=True, feature_R_idx=[6, 7], max_bins=10)
# nbc.fit(X, y)
# y_proba = nbc.predict_proba(X)
# print(y_proba)
# y_hat = nbc.predict(X)
# print(y_hat)

# Synthetic 2-D data with 4 clusters for visualizing the decision boundary.
# Fix: the original line had injected junk text ("random_state=0服务器托管网)")
# that broke the call; restored to random_state=0.
X, y = make_blobs(n_samples=500, n_features=2, centers=4, cluster_std=0.85, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

# Variant 1: discretize both continuous features by binning (20 bins each).
nbc = NaiveBayesClassifier(is_binned=True, max_bins=20, is_feature_all_R=True)
nbc.fit(X_train, y_train)
y_pred = nbc.predict(X_test)
print(classification_report(y_test, y_pred))
plt.figure(figsize=(14, 5))
plt.subplot(121)
plot_decision_function(X_train, y_train, nbc, is_show=False)

# Variant 2: model the continuous features directly (Gaussian parameter estimates).
nbc = NaiveBayesClassifier(is_binned=False, feature_R_idx=[0, 1])
nbc.fit(X_train, y_train)
y_pred = nbc.predict(X_test)
print(classification_report(y_test, y_pred))
plt.subplot(122)
plot_decision_function(X_train, y_train, nbc, is_show=False)
plt.show()

# Mushroom dataset example, kept for reference:
# al = pd.read_csv("mushroom/agaricus-lepiota.data").dropna()
服务器托管,北京服务器托管,服务器租用 http://www.fwqtg.net
相关推荐: Oracle的学习心得和知识总结(三十二)|Oracle数据库数据库回放功能之论文四翻译及学习
目录结构 注:提前言明 本文借鉴了以下博主、书籍或网站的内容,其列表如下: 1、参考书籍:《Oracle Database SQL Language Reference》 2、参考书籍:《PostgreSQL中文手册》 3、EDB Postgres Advan…