#encoding:utf8import numpy as npimport pandas as pd'''分组计算: 拆分:根据什么进行分组 应用:每个分组进行怎样的计算 合并:每个分组的计算结果合并起来'''df = pd.DataFrame( { 'key1':['a','a','b','b','a'], 'key2':['one','two','one','two','one'], 'data1':np.random.randint(1,10,5), 'data2':np.random.randint(1,10,5), })print(df)''' data1 data2 key1 key20 3 9 a one1 3 5 a two2 7 3 b one3 1 1 b two4 2 1 a one'''print(df['data1'])'''0 71 72 43 14 8'''#计算data1列按照key1字段聚合求平均值print(df['data1'].groupby(df['key1']).mean())''' data1 data2 key1 key20 1 3 a one1 4 4 a two2 3 9 b one3 7 3 b two4 1 3 a onekey1a 2b 5'''#除了可以在数据内按照聚合,也可以自定义聚合#这里列表中元素表示data1列的元素位置聚合#data1列下的第1,3,4个元素聚合,第2,5个元素聚合,然后求平均值key = [1,2,1,1,2]print(df)print(df['data1'].groupby(key).mean())''' data1 data2 key1 key20 9 9 a one1 5 2 a two2 4 8 b one3 2 4 b two4 1 3 a one1 52 3'''#分组也可以是多级列表#在groupby中按照元素索引顺序进行排组的依据先后print(df)print(df['data1'].groupby([df['key1'],df['key2']]).sum())print(df['data1'].groupby([df['key2'],df['key1']]).sum())''' data1 data2 key1 key20 9 8 a one1 3 4 a two2 7 8 b one3 4 1 b two4 3 7 a onea[9,3,3]---->one[9,3]....key1 key2a one 12 two 3b one 7 two 4one[9,7,3]---->a[9,3]....key2 key1one a 12 b 7two a 3 b 4'''#查看分组的个数print(df['data1'].groupby([df['key1'],df['key2']]).size())'''key1 key2a one 2 two 1b one 1 two 1'''#按照key1进行分组#生成的是一个DataFrameprint(df.groupby('key1').sum())''' data1 data2key1a 16 12b 10 6'''#对分组之后可进行索引的选取print(df.groupby('key1').sum()['data1'])'''key1a 14b 15'''#当然也可以进行多级分组,然后转换为DataFramemean = df.groupby(['key1','key2']).sum()print(mean)print(mean.unstack())''' data1 data2key1 key2a one 17 6 two 1 3b one 7 3 two 1 6 data1 data2key2 one two one twokey1a 17 1 6 3b 7 1 3 6'''#groupby支持迭代for name,group in df.groupby('key1'): print(name) print(group)'''a data1 data2 key1 key20 9 8 a one1 8 8 a two4 7 7 a oneb data1 data2 key1 key22 6 1 b one3 5 8 b two'''#也可以对groupby转换为字典print(dict(list(df.groupby('key1'))))print(dict(list(df.groupby('key1')))['a'])print(dict(list(df.groupby('key1')))['b'])'''{'a': data1 data2 key1 key20 9 4 a one1 4 3 a two4 5 6 a one, 'b': data1 data2 key1 key22 3 7 b one3 2 8 b two} data1 data2 key1 key20 9 4 a one1 4 3 a two4 5 6 a one data1 data2 key1 key22 3 7 b one3 2 8 b two'''#按照列类型进行分组print(df.groupby(df.dtypes,axis=1).sum())'''0 4 aone1 15 atwo2 10 bone3 8 btwo4 18 aone'''#以上都是按照列表进行分组#下面用其他分组形式来进行分组#通过字典进行分组df = pd.DataFrame( np.random.randint(1,10,(5,5)), columns=list('abcde'), index=['Alice','Bob','Candy','Dark','Emily'])#看一下处理非数字df.ix[1,1:3] = np.NaNprint(df)''' a b c d eAlice 3 3.0 7.0 7 9Bob 4 NaN NaN 3 4Candy 9 5.0 1.0 4 1Dark 6 3.0 9.0 9 9Emily 8 4.0 2.0 3 6'''mapping = { 'a':'red', 'b':'red', 'c':'blue', 'd':'orange', 'e':'blue'}grouped = df.groupby(mapping,axis=1)print(grouped.sum())''' a b c d eAlice 4 5.0 2.0 2 2Bob 3 NaN NaN 8 1Candy 6 7.0 6.0 9 7Dark 7 8.0 1.0 4 3Emily 8 6.0 8.0 3 3 blue orange redAlice 4.0 2.0 9.0Bob 1.0 8.0 3.0Candy 13.0 9.0 13.0Dark 4.0 4.0 15.0Emily 11.0 3.0 14.0可以看对Nan和数字分组计算是按照Nan=0来处理的'''print(grouped.size())print(grouped.count())'''blue 2orange 1red 2 blue orange redAlice 2 1 2Bob 1 1 1Candy 2 1 2Dark 2 1 2Emily 2 1 2Nan是没有统计个数的'''#通过函数进行分组df = pd.DataFrame( np.random.randint(1,10,(5,5)), columns=list('abcde'), index=['Alice','Bob','Candy','Dark','Emily'])#按照行索引def _group_by(idx): print(idx) return idxprint(df.groupby(_group_by).size())print(df.groupby(_group_by).count())'''AliceBobCandyDarkEmily a b c d eAlice 1 1 1 1 1Bob 1 1 1 1 1Candy 1 1 1 1 1Dark 1 1 1 1 1Emily 1 1 1 1 1按照行来进行分组的'''#按照行索引长度def _group_by2(idx): print(idx) return len(idx)print(df.groupby(_group_by2).size())print(df.groupby(_group_by2).count())'''AliceBobCandyDarkEmily a b c d e3 1 1 1 1 14 1 1 1 1 15 3 3 3 3 3'''#通过索引级别进行分组columns = pd.MultiIndex.from_arrays( [ ['China','USA','China','USA','China'], ['A','A','B','C','B'] ], names=['country','index'])df = pd.DataFrame(np.random.randint(1,10,(5,5)),columns=columns)print(df)'''country China USA China USA Chinaindex A A B C B0 9 5 4 5 21 8 5 4 6 92 2 7 3 1 33 2 1 6 5 54 5 3 6 9 4'''print(df.groupby(level='country',axis=1).sum())'''country China USA0 17 121 19 122 19 83 21 134 18 9'''print(df.groupby(level='index',axis=1).sum())'''index A B C0 7 7 61 11 12 32 12 11 93 14 11 44 17 15 8'''