作者 by 超米 / 2024-05-14 / 暂无评论 / 47 个足迹
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
# Lab banner identifying the student who submitted the experiment.
print("物联网工程2201,220223433,米热地力·买买提")

# ---- 1. Load the iris data set and describe it ----
iris_dataset = load_iris()
print("特征名:", iris_dataset.feature_names)
print("数据类型:", type(iris_dataset.data))
print("数据维度:", iris_dataset.data.shape)
print("标记名:", iris_dataset.target_names)
print("标记类型:", type(iris_dataset.target_names))
print("标记维度:", iris_dataset.target_names.shape)
# Number of samples carrying each of the three class labels.
print("标记值为0的有:", np.count_nonzero(iris_dataset.target == 0))
print("标记值为1的有:", np.count_nonzero(iris_dataset.target == 1))
print("标记值为2的有:", np.count_nonzero(iris_dataset.target == 2))
print(iris_dataset["DESCR"])

# ---- 2. Wrap the features in a DataFrame and show the non-null mask ----
iris_dataframe = pd.DataFrame(
    iris_dataset["data"],
    columns=iris_dataset["feature_names"],
)
print(iris_dataframe.notnull())

# ---- 3. Hold out 20% of the samples for testing (fixed seed) ----
X_train, X_test, y_train, y_test = train_test_split(
    iris_dataframe,
    iris_dataset["target"],
    test_size=0.2,
    random_state=0,
)
print("X_train shape", X_train.shape)  # training split
print("X_test shape", X_test.shape)  # test split

# ---- 4. Pairwise scatter plot of the training features, coloured by label ----
iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset["feature_names"])
grr = pd.plotting.scatter_matrix(
    iris_dataframe,
    c=y_train,
    figsize=None,
    marker='.',
    hist_kwds=None,
    s=60,
    alpha=0.5,
)
plt.show()

# ---- 5. Fit a default k-nearest-neighbours classifier and evaluate it ----
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
predict_y = knn.predict(X_test)
print("测试集大小:", X_test.shape)
print("真实类别:", y_test)
print("预测类别:", predict_y)
print("比较结果", predict_y == y_test)
# Report the accuracy on the held-out test split.
print("预测准确率:", knn.score(X_test, y_test))
#coding=utf-8
import numpy as np
from os import listdir
def _loadDigitDir(dirPath):
    """Vectorize every digit file found in dirPath.

    Each file is read as a 32x32 text image through img2vector, and its label
    is the integer that precedes the first underscore in the file name.

    Args:
        dirPath: directory that contains the digit text files.

    Returns:
        (X, labels): X is a (number of files, 1024) array with one image per
        row; labels is a list of ints in the same order.
    """
    # Local import: the file only imports `listdir` from os at module level.
    from os.path import join

    fileList = listdir(dirPath)
    X = np.zeros((len(fileList), 32 * 32))
    labels = []
    for i, fileName in enumerate(fileList):
        # Open the file from the directory that was actually listed, so the
        # result does not depend on the current working directory.
        X[i, :] = img2vector(join(dirPath, fileName), 32, 32)
        labels.append(int(fileName.split('_')[0]))
    return X, labels


def loadDataSet(dataDir=r'C:\Users\Administrator\Downloads\实验一:数据集\HWdigits'):
    """Load the handwritten-digit training and test sets.

    Args:
        dataDir: directory holding the `trainSet` and `testSet`
            sub-directories. Defaults to the location used by the original
            script.

    Returns:
        (trainX, trainY, testX, testY): feature arrays with one 1024-element
        row per image, and the matching lists of integer labels.
    """
    from os.path import join

    print("物联网工程2201,220223433,米热地力·买买提")

    # Training set.
    print("1.Loading trainSet...")
    trainX, trainY = _loadDigitDir(join(dataDir, 'trainSet'))

    # Test set.
    print("2.Loading testSet...")
    testX, testY = _loadDigitDir(join(dataDir, 'testSet'))

    return trainX, trainY, testX, testY
def img2vector(filename, h, w):
    """Read an h-by-w text image of digits into a flat row vector.

    The file is expected to hold at least h lines, each with at least w
    single-digit characters. Character (row, col) is stored at flat index
    row * w + col.

    Args:
        filename: path of the text file to read.
        h: number of lines (image rows) to read.
        w: number of characters (image columns) to read from each line.

    Returns:
        A float array of shape (1, h * w), or None when the file does not
        exist (a message is printed in that case).
    """
    imgVector = np.zeros((1, h * w))
    try:
        # The context manager closes the file even if parsing fails.
        with open(filename) as fileIn:
            for row in range(h):
                lineStr = fileIn.readline()
                # Use the real width as the row stride; a hard-coded 32 would
                # misplace pixels for any other image size.
                start = row * w
                imgVector[0, start:start + w] = [int(lineStr[col]) for col in range(w)]
    except FileNotFoundError:
        print("File not found:", filename)
        return None
    return imgVector
def myKNN(testDigit, trainX, trainY, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distance is the Manhattan (L1) distance between testDigit and every row
    of trainX.

    Args:
        testDigit: feature vector of the sample to classify; it is subtracted
            from trainX, so it must broadcast against the rows of trainX.
        trainX: 2-D array with one training sample per row.
        trainY: labels, where trainY[i] is the label of row i of trainX.
        k: number of nearest neighbours that vote. Values larger than the
            number of training samples are clamped to that number.

    Returns:
        The label with the most votes. On a tie, the label that received its
        first vote from the nearer neighbour wins.

    Raises:
        ValueError: if k is smaller than 1 or trainX has no rows.
    """
    numSamples = trainX.shape[0]  # one row per training sample
    if numSamples == 0:
        raise ValueError("trainX must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    # Never ask for more neighbours than there are training samples.
    k = min(k, numSamples)

    # 1. Manhattan distance from the test sample to every training sample.
    distance = np.sum(np.abs(testDigit - trainX), axis=1)

    # 2. Training indices ordered from nearest to farthest.
    sortedDistIndices = np.argsort(distance)

    # 3. Count the votes of the k nearest neighbours. The dict keeps insertion
    #    order, so labels appear in order of their nearest voter.
    classCount = {}
    for idx in sortedDistIndices[:k]:
        voteLabel = trainY[idx]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # 4. max() returns the first label that reaches the highest count.
    return max(classCount, key=classCount.get)
# Evaluate the hand-written KNN classifier (k = 3) on the digit test set.
train_x, train_y, test_x, test_y = loadDataSet()
numTestSamples = test_x.shape[0]

print("3.Find the most frequent label in k-nearest...")
print("4.Show the result...")

# Classify each test sample and count how many predictions are correct.
matchCount = 0
for sample, answer in zip(test_x, test_y):
    predict = myKNN(sample, train_x, train_y, 3)
    print("result is: %d, real answer is: %d" % (predict, answer))
    matchCount += 1 if predict == answer else 0

accuracy = matchCount / numTestSamples

# 5. Report the error count and the accuracy as a percentage.
print("5.Show the accuracy..,")
print("The total number of errors is:%d" % (numTestSamples - matchCount))
print(' The classify accuracy is: %.2f%%' % (accuracy * 100))
独特见解