
from numpy import *
import operator
# listdir returns the names of the entries in a given directory
from os import listdir
def createDataSet():
    """Return the four-point toy training set together with its labels.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 NumPy array of
        sample coordinates and ``labels`` is a list holding the class name
        of each row of ``group``.
    """
    # Keep each point next to its class so the pairing is obvious.
    samples = [
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    ]
    group = array([point for point, _ in samples])
    labels = [tag for _, tag in samples]
    return group, labels
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Every non-blank line must hold three numeric feature columns followed by
    an integer class label in the last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` -- an ``(n, 3)`` NumPy array
        holding the first three columns of each line, and a list holding the
        integer parsed from the last column of each line.

    Raises:
        ValueError: If a feature or label column is not numeric.
    """
    # The context manager closes the file even if reading fails.
    with open(filename) as fr:
        arrayOLines = [line.strip() for line in fr]
    # Ignore blank lines (e.g. a trailing newline at the end of the file).
    arrayOLines = [line for line in arrayOLines if line]

    returnMat = zeros((len(arrayOLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        listFromLine = line.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # The last column is the class label; callers format it with %d and
        # use it as a list index, so it is stored as an int.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    Applies ``newValue = (oldValue - min) / (max - min)`` column by column.

    Args:
        dataSet: 2-D NumPy array with one sample per row.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` -- the normalized array,
        the per-column ``max - min`` and the per-column minimum.
    """
    # min(0)/max(0) reduce over rows, giving one value per column.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has a range of 0; divide it by 1 instead so it
    # normalizes to 0 rather than NaN. The returned ``ranges`` is untouched.
    divisors = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals

def datingClassTest():
    """Run a hold-out evaluation of the kNN classifier on the dating data.

    Loads the dating file, normalizes its features, classifies the first
    ``hoRatio`` fraction of the rows against the remaining rows with k=3,
    and prints each prediction followed by the overall error rate.
    """
    hoRatio = 0.10
    # Raw string: the path contains backslashes that are not escape sequences.
    datingDataMat, datingLabels = file2matrix(r'F:\study\datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    # Rows after the test slice are the training set.
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainingMat, trainingLabels, 3)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is %f" % (errorCount / float(numTestVecs)))

def classifyPerson():
    """Ask for three feature values and print the predicted preference.

    Reads the three values from standard input, normalizes them with the
    ranges and minimums of the dating data set, classifies the result with
    k=3 and prints the matching entry of ``resultList``.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        ask = raw_input
    except NameError:
        ask = input
    percentTats = float(ask("percentage of time spent playing video games?"))
    ffMiles = float(ask("frequent flier miles earned per year?"))
    iceCream = float(ask("liter of ice cream consumed per year?"))
    # Raw string: the path contains backslashes that are not escape sequences.
    datingDataMat, datingLabels = file2matrix(r'F:\study\datingTestSet2.txt')
    # Bind the normalized matrix to the same name that is passed to
    # classify0 below, so no module-level variable is needed.
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCream])
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    # "%s %s" reproduces the separator the two-item print statement emitted.
    print("%s %s" % ("You will probably like this person: ", resultList[classifierResult - 1]))

def img2vector(filename):
    """Flatten a 32x32 text image of digit characters into a 1x1024 vector.

    Args:
        filename: Path of a text file whose first 32 lines each start with
            32 digit characters.

    Returns:
        A ``(1, 1024)`` NumPy array; the character at row ``i``, column ``j``
        is stored at index ``32 * i + j``.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if a line is malformed.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect

def handwritingClassTest():
    """Evaluate the kNN classifier on the handwritten-digit text images.

    Loads every file in the training directory into one row of a matrix,
    classifies every file in the test directory against it with k=3, and
    prints each prediction, the number of errors and the error rate.
    """
    trainingDir = 'F:\\study\\trainingDigits'
    testDir = 'F:\\study\\testDigits'

    hwLabels = []
    trainingFileList = listdir(trainingDir)
    m = len(trainingFileList)
    # One 1024-element row per training file.
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        # The class is the integer before the first '_' of the file name,
        # once the extension has been dropped.
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        # Record the label so it stays aligned with row i of trainingMat.
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('%s\\%s' % (trainingDir, fileNameStr))

    testFileList = listdir(testDir)
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('%s\\%s' % (testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("the classifier came back with: %d, the real answer is : %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is : %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
# Build the toy data set at import time; `group` and `labels` become
# module-level names.
group, labels = createDataSet()
# Debug print of the toy data set (Python 2 syntax), left disabled:
# print group,labels
# Recorded output of `print group,labels`:
# [[ 1.   1.1]
#  [ 1.   1. ]
#  [ 0.   0. ]
#  [ 0.   0.1]] ['A', 'A', 'B', 'B']
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Args:
        inX: Feature vector to classify.
        dataSet: 2-D NumPy array of training samples, one per row.
        labels: Sequence with the class label of each row of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` rows of ``dataSet``
        closest to ``inX`` by Euclidean distance.
    """
    # Number of training samples (rows).
    dataSetSize = dataSet.shape[0]

    # Repeat inX once per training row so it can be subtracted element-wise,
    # e.g. [[0,0]]*4 - [[1.0,1.1],[1.0,1.0],[0,0],[0,0.1]]
    #      = [[-1,-1.1],[-1,-1],[0,0],[0,-0.1]]
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    # Euclidean distance: square, sum across each row, take the square root.
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    # argsort returns row indices ordered by ascending distance, not values.
    sortedDistIndicies = distances.argsort()

    # Tally how often each label occurs among the k closest rows.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Order the (label, count) pairs by count, largest first. items() is
    # available on both Python 2 and 3, unlike iteritems().
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    # The first pair carries the winning label.
    return sortedClassCount[0][0]
# Load the dating data set at import time from the hard-coded path; the
# results are bound as module-level names.
datingDataMat,datingLabels = file2matrix('F:\study\datingTestSet2.txt')
# matplotlib is imported here, mid-file, right before it is first used.
import matplotlib
import matplotlib.pyplot as plt
# Scatter-plot column 1 of the data against column 2. The label vector
# scaled by 15.0 is passed as both the third and fourth positional arguments
# (marker size and colour in matplotlib's scatter signature).
# NOTE(review): no plt.show() call is visible here -- presumably the figure
# was rendered by the notebook this code came from; confirm.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(datingDataMat[:,1], datingDataMat[:,2],15.0*array(datingLabels),15.0*array(datingLabels))


# Normalize the features at module level. Note that the third name is
# spelled `minVales` here.
normMat, ranges, minVales = autoNorm(datingDataMat)
# Recorded notebook output (a printed array and a sample classifyPerson() session):
# [  9.12730000e+04   2.09193490e+01   1.69436100e+00]
# percentage of time spent playing video games?56
# frequent flier miles earned per year?1654
# liter of ice cream consumed per year?0.65
# [  9.12730000e+04   2.09193490e+01   1.69436100e+00]
# You will probably like this person:  in large doses
# Each backslash in the path is escaped with another backslash; otherwise
# the file cannot be found.
testVector = img2vector('F:\\study\\trainingDigits\\0_9.txt')
# Recorded notebook output: an array excerpt (apparently a slice of the vector
# returned by img2vector), followed by part of the log printed by
# handwritingClassTest().
# array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
#         0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
#         0.,  0.,  0.,  0.,  0.])
# the classifier came back with: 0, the real answer is : 0
# the classifier came back with: 0, the real answer is : 0
# the classifier came back with: 0, the real answer is : 0
# the classifier came back with: 1, the real answer is : 1
# the classifier came back with: 1, the real answer is : 1
# the classifier came back with: 2, the real answer is : 2
# the classifier came back with: 2, the real answer is : 2
# the classifier came back with: 2, the real answer is : 2
# the classifier came back with: 3, the real answer is : 3
# the classifier came back with: 3, the real answer is : 3
# the classifier came back with: 9, the real answer is : 3
# the classifier came back with: 3, the real answer is : 3
# the classifier came back with: 3, the real answer is : 3
# the classifier came back with: 4, the real answer is : 4
# the classifier came back with: 4, the real answer is : 4
# the classifier came back with: 5, the real answer is : 5
# the classifier came back with: 5, the real answer is : 5
# the classifier came back with: 5, the real answer is : 5
# the classifier came back with: 6, the real answer is : 6
# the classifier came back with: 7, the real answer is : 7
# the classifier came back with: 6, the real answer is : 8
# the classifier came back with: 8, the real answer is : 8
# the classifier came back with: 9, the real answer is : 9
# the classifier came back with: 9, the real answer is : 9
#
# the total number of errors is : 11
#
# the total error rate is: 0.011628