实验一:鸢尾花分类、手写数字识别

作者 by 超米 / 2024-05-14 / 暂无评论 / 47 个足迹

数据集:实验一:数据集.zip

鸢尾花分类:

 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 from sklearn.datasets import load_iris
 from sklearn.model_selection import train_test_split
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.preprocessing import StandardScaler

 # --- Iris classification with k-nearest neighbours ---------------------------
 # NOTE: every line of this snippet carries the same one-space gutter as the
 # import lines above it; strip that gutter before running it as a script.
 print("物联网工程2201,220223433,米热地力·买买提")

 # Load the bundled Iris dataset and print a summary of its contents.
 iris_dataset = load_iris()
 print("特征名:", iris_dataset["feature_names"])
 print("数据类型:", type(iris_dataset["data"]))
 print("数据维度:", iris_dataset["data"].shape)
 print("标记名:", iris_dataset["target_names"])
 print("标记类型:", type(iris_dataset["target_names"]))
 # This line used to be indented deeper than its neighbours, which is an
 # IndentationError in a flat script; it is now aligned with the rest.
 print("标记维度:", iris_dataset["target_names"].shape)
 # Number of samples carrying each of the three target values.
 print("标记值为0的有:", len(np.where(iris_dataset.target == 0)[0]))
 print("标记值为1的有:", len(np.where(iris_dataset.target == 1)[0]))
 print("标记值为2的有:", len(np.where(iris_dataset.target == 2)[0]))
 print(iris_dataset.DESCR)

 # Wrap the feature matrix in a DataFrame and print the element-wise
 # "is not null" mask.
 iris_dataframe = pd.DataFrame(iris_dataset.data, columns=iris_dataset.feature_names)
 print(iris_dataframe.notnull())

 # Split the dataset: 80% training, 20% test, fixed seed for repeatability.
 X_train, X_test, y_train, y_test = train_test_split(
     iris_dataframe, iris_dataset.target, test_size=0.2, random_state=0)
 print("X_train shape", X_train.shape)  # training set
 print("X_test shape", X_test.shape)  # test set

 # Pairwise scatter plots of the training features, coloured by class label.
 iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)
 grr = pd.plotting.scatter_matrix(iris_dataframe, c=y_train, figsize=None, marker='.',
                                  hist_kwds=None, s=60, alpha=0.5)

 plt.show()

 # Fit a KNN classifier with its default settings and predict the test set.
 knn = KNeighborsClassifier()
 knn.fit(X_train, y_train)
 predict_y = knn.predict(X_test)
 print("测试集大小:", X_test.shape)
 print("真实类别:", y_test)
 print("预测类别:", predict_y)
 print("比较结果", predict_y == y_test)
 # Show the prediction accuracy on the test set.
 print("预测准确率:", knn.score(X_test, y_test))

手写体识别:

#coding=utf-8
import numpy as np
from os import listdir

def _loadDigitDir(dirPath):
    """Vectorize every digit file in dirPath and collect its label.

    Each file is read with img2vector as a 32x32 grid; the label is the
    integer before the first '_' in the file name.

    Returns:
        (dataX, dataY): a (fileCount, 1024) array and a list of int labels,
        both in the order returned by listdir.
    """
    # Local import: the file only imports listdir from os at module level.
    from os.path import join

    fileList = listdir(dirPath)
    dataX = np.zeros((len(fileList), 32 * 32))
    dataY = []
    for i, fileName in enumerate(fileList):
        # Read from the same directory that was listed. The original read from
        # a relative 'HWdigits/...' path while listing an absolute one, so the
        # two could point at different places.
        dataX[i, :] = img2vector(join(dirPath, fileName), 32, 32)
        dataY.append(int(fileName.split('_')[0]))
    return dataX, dataY


def loadDataSet(trainDir=r'C:\Users\Administrator\Downloads\实验一:数据集\HWdigits\trainSet',
                testDir=r'C:\Users\Administrator\Downloads\实验一:数据集\HWdigits\testSet'):
    """Load the hand-written digit training and test sets.

    Args:
        trainDir: directory holding the training files (defaults to the
            original hard-coded location).
        testDir: directory holding the test files (defaults to the original
            hard-coded location).

    Returns:
        (trainX, trainY, testX, testY): the X values are (n, 1024) arrays of
        flattened 32x32 images, the Y values are lists of int labels.
    """
    print("物联网工程2201,220223433,米热地力·买买提")
    # Load the training set.
    print("1.Loading trainSet...")
    trainX, trainY = _loadDigitDir(trainDir)
    # Load the test set.
    print("2.Loading testSet...")
    testX, testY = _loadDigitDir(testDir)
    return trainX, trainY, testX, testY

def img2vector(filename, h, w):
    """Read an h-by-w grid of digit characters from a text file into a row vector.

    The first h lines of the file are read, and the first w characters of each
    line are converted with int() and stored row-major.

    Args:
        filename: path of the text file to read.
        h: number of lines (rows) to read.
        w: number of characters (columns) to read from each line.

    Returns:
        A (1, h * w) float array, or None if the file does not exist (a
        message is printed in that case).
    """
    imgVector = np.zeros((1, h * w))
    try:
        fileIn = open(filename)
    except FileNotFoundError:
        print("File not found:", filename)
        return None
    # The context manager closes the file; the original never closed it.
    with fileIn:
        for row in range(h):
            lineStr = fileIn.readline()
            for col in range(w):
                # Index with the actual width; the original hard-coded 32,
                # which is only correct when w == 32.
                imgVector[0, row * w + col] = int(lineStr[col])
    return imgVector

def myKNN(testDigit, trainX, trainY, k):
    """Classify one sample by majority vote among its k nearest training samples.

    Distance is the Manhattan (L1) distance between testDigit and each row of
    trainX.

    Args:
        testDigit: feature vector of the sample to classify.
        trainX: 2-D array with one training sample per row.
        trainY: labels, indexable by row number of trainX.
        k: number of nearest neighbours that vote; values larger than the
            number of training samples are clamped to that number.

    Returns:
        The label with the most votes. On a tie, the label that appears first
        when walking the neighbours from nearest to farthest wins.

    Raises:
        ValueError: if the training set is empty or k is smaller than 1.
    """
    numSamples = trainX.shape[0]  # one training sample per row
    if numSamples == 0 or k < 1:
        raise ValueError("myKNN needs a non-empty training set and k >= 1")
    # Never ask for more neighbours than there are training samples.
    k = min(k, numSamples)

    # 1. Manhattan distance from the test sample to every training sample.
    distance = np.sum(np.abs(testDigit - trainX), axis=1)
    # 2. Training-sample indices ordered from nearest to farthest.
    sortedDistIndices = np.argsort(distance)
    # 3./4. Count the labels of the k nearest neighbours.
    classCount = {}
    for idx in sortedDistIndices[:k]:
        voteLabel = trainY[idx]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    # 5. Most frequent label; max() returns the first maximal key in insertion
    #    order, which matches the original strict ">" scan.
    return max(classCount, key=classCount.get)

# Driver: load both digit sets, classify every test sample with k = 3 and
# report the number of errors and the accuracy.
train_x, train_y, test_x, test_y = loadDataSet()
numTestSamples = test_x.shape[0]
matchCount = 0
print("3.Find the most frequent label in k-nearest...")
print("4.Show the result...")
for i, testDigit in enumerate(test_x):
    predict = myKNN(testDigit, train_x, train_y, 3)
    print("result is: %d, real answer is: %d" % (predict, test_y[i]))
    # Count one hit whenever the predicted label equals the true label.
    matchCount += int(predict == test_y[i])
accuracy = matchCount / float(numTestSamples)
# 5. Print the summary.
print("5.Show the accuracy..,")
print("The total number of errors is:%d" % (numTestSamples - matchCount))
print('  The classify accuracy is: %.2f%%' % (accuracy * 100))

独特见解