我有以下几个字典:
# Sample input data for the example.

# Maps each gene (query) identifier to the individual (genome) it belongs to.
dict_assembly = {
    'ind1gene1': 'individual1',
    'ind1gene2': 'individual1',
    'ind1gene3': 'individual1',
    'ind2gene1': 'individual2',
    'ind2gene2': 'individual2',
    'ind2gene3': 'individual2',
    'ind3gene1': 'individual3',
    'ind3gene2': 'individual3',
    'ind3gene3': 'individual3',
    'ind4gene1': 'individual4',
    'ind4gene2': 'individual4',
    'ind4gene3': 'individual4',
    'ind4gene4': 'individual4',
}

# Maps each gene (query) identifier to its hit (subject) label.
dict_bhit = {
    'ind1gene1': 'AAAAA',
    'ind1gene2': 'BBBBB',
    'ind1gene3': 'CCCCC',
    'ind2gene1': 'AAAAA',
    'ind2gene2': 'BBBBB',
    'ind2gene3': 'BBBBB',
    'ind3gene1': 'AAAAA',
    'ind3gene2': 'BBBBB',
    'ind3gene3': 'CCCCC',
    'ind4gene1': 'AAAAA',
    'ind4gene2': 'BBBBB',
    'ind4gene3': 'CCCCC',
    'ind4gene4': 'DDDDD',
}

# Maps gene (query) identifiers to an identity value stored as a string.
# NOTE(review): 'indi5gene1' does not follow the 'indN...' key pattern used by
# every other key -- possibly a typo for 'ind5gene1'; confirm before relying on
# it. The ind5 keys here have no counterpart in dict_assembly or dict_bhit.
dict_identity = {
    'ind1gene1': '98',
    'ind2gene1': '96',
    'ind3gene1': '95',
    'ind4gene1': '96',
    'indi5gene1': '94',
    'ind1gene2': '67',
    'ind2gene2': '76',
    'ind3gene2': '80',
    'ind4gene2': '77',
    'ind5gene2': '76',
    'ind1gene3': '98',
    'ind2gene3': '97',
    'ind3gene3': '96',
    'ind4gene3': '96',
    'ind4gene4': '40',
}

data = {}  # temporary dictionary
本示例的代码分为两个部分。
第一部分:
import pandas as pd
import time
# Build a genome-by-subject presence/absence matrix from the query->genome
# mapping (dict_assembly) and the query->subject mapping (dict_bhit), write it
# to "concatenated.matrix" as a tab-separated table, print it, and report the
# elapsed wall-clock time.
start = time.time()

col_subject = ['query', 'subject']
# list(...) hands the constructor a concrete sequence of (key, value) pairs;
# on Python 3 dict.items() is a view, which older pandas versions reject.
df_accession = pd.DataFrame(list(dict_bhit.items()), columns=col_subject)
col_genome = ['query', 'genome']
df_assembly = pd.DataFrame(list(dict_assembly.items()), columns=col_genome)

# Look up each query's subject; Series.map leaves NaN for queries that are
# missing from the lookup.
df_assembly['subject'] = df_assembly['query'].map(df_accession.set_index('query')['subject'])

# One indicator column per distinct subject, then collapse the rows of each
# genome with max() so a cell is 1 if any of that genome's queries hit the
# subject. groupby(level=0, sort=False) replaces the `level=` keyword of
# DataFrame.max, which newer pandas no longer accepts. The former column-wise
# collapse is omitted: get_dummies on a single Series already yields one
# column per distinct value, so there are no duplicate columns to merge.
# astype(int) keeps the 0/1 output on pandas versions whose dummies are bool.
matrix = (
    pd.get_dummies(df_assembly.set_index('genome')['subject'])
    .groupby(level=0, sort=False)
    .max()
    .astype(int)
)

# The context manager guarantees the file is flushed and closed.
with open("concatenated.matrix", "w") as matrix_file:
    matrix.to_csv(matrix_file, sep='\t', header=True, index=True)

# Single-argument print(...) calls produce the same text on Python 2 and 3.
print(matrix)
end = time.time()
print('This step spent %s seconds\n' % round(end - start, 4))
相关分类