本文介绍了如何在 Pandas 数据框中展开列的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着跟版网的小编来一起学习吧!
                        
                        问题描述
I have the following pandas data frame:
import pandas as pd
import numpy as np
df = pd.DataFrame({
               'fc': [100,100,112,1.3,14,125],
               'sample_id': ['S1','S1','S1','S2','S2','S2'],
               'gene_symbol': ['a', 'b', 'c', 'a', 'b', 'c'],
               })
df = df[['gene_symbol', 'sample_id', 'fc']]
df
Which produces this:
Out[11]:
  gene_symbol sample_id     fc
0           a        S1  100.0
1           b        S1  100.0
2           c        S1  112.0
3           a        S2    1.3
4           b        S2   14.0
5           c        S2  125.0
How can I spread sample_id so that in the end I get this:
gene_symbol    S1   S2
a             100   1.3
b             100   14.0
c             112   125.0
 解决方案 
Use pivot or unstack:
#df = df[['gene_symbol', 'sample_id', 'fc']]
df = df.pivot(index='gene_symbol',columns='sample_id',values='fc')
print (df)
sample_id       S1     S2
gene_symbol              
a            100.0    1.3
b            100.0   14.0
c            112.0  125.0
df = df.set_index(['gene_symbol','sample_id'])['fc'].unstack(fill_value=0)
print (df)
sample_id       S1     S2
gene_symbol              
a            100.0    1.3
b            100.0   14.0
c            112.0  125.0
But if duplicates, need pivot_table or aggregate with groupby or , mean can be changed to sum, median, ...:
df = pd.DataFrame({
               'fc': [100,100,112,1.3,14,125, 100],
               'sample_id': ['S1','S1','S1','S2','S2','S2', 'S2'],
               'gene_symbol': ['a', 'b', 'c', 'a', 'b', 'c', 'c'],
               })
print (df)
      fc gene_symbol sample_id
0  100.0           a        S1
1  100.0           b        S1
2  112.0           c        S1
3    1.3           a        S2
4   14.0           b        S2
5  125.0           c        S2 <- same c, S2, different fc
6  100.0           c        S2 <- same c, S2, different fc
df = df.pivot(index='gene_symbol',columns='sample_id',values='fc')
ValueError: Index contains duplicate entries, cannot reshape
df = df.pivot_table(index='gene_symbol',columns='sample_id',values='fc', aggfunc='mean')
print (df)
sample_id       S1     S2
gene_symbol              
a            100.0    1.3
b            100.0   14.0
c            112.0  112.5
df = df.groupby(['gene_symbol','sample_id'])['fc'].mean().unstack(fill_value=0)
print (df)
sample_id       S1     S2
gene_symbol              
a            100.0    1.3
b            100.0   14.0
c            112.0  112.5
EDIT:
For cleaning set columns name to None and reset_index:
df.columns.name = None
df = df.reset_index()
print (df)
  gene_symbol     S1     S2
0           a  100.0    1.3
1           b  100.0   14.0
2           c  112.0  112.5
这篇关于如何在 Pandas 数据框中展开列的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持跟版网!
The End


大气响应式网络建站服务公司织梦模板
高端大气html5设计公司网站源码
织梦dede网页模板下载素材销售下载站平台(带会员中心带筛选)
财税代理公司注册代理记账网站织梦模板(带手机端)
成人高考自考在职研究生教育机构网站源码(带手机端)
高端HTML5响应式企业集团通用类网站织梦模板(自适应手机端)