使用 dict.items() 为大型数据集优化字典查找

按照 Sam Hollenbach 的建议，我可能会对您的代码做出以下 4 处更改。

```python
import sys
from Bio import AlignIO
from Bio import SeqIO
from Bio.Seq import Seq
import time
start_time = time.time()
from collections import defaultdict
databasefile = sys.argv[1]
queryfile = sys.argv[2]
file_hits = "./" + sys.argv[2].split("_protein")[0] + "_ZeNovo_hits_v1.txt"
file_report = "./" + sys.argv[2].split("_protein")[0] + "_ZeNovo_report_v1.txt"
_format = "fasta" #(change 1)
output_file = open(file_hits, 'w')
output_file_2 = open(file_report,'w')
sequences_dict = defaultdict(list)
output_file.write("{}\t{}\n".format("protein_query", "hits"))
for record in SeqIO.parse(databasefile, _format):
    sequences_dict[record.seq].append(record.description) #(change 2)
    #sequences_dict[record.description] = str(record.seq)
print("processed database in --- {:.3f} seconds ---".format(time.time() - start_time))
processed_counter = 0
for record in SeqIO.parse(queryfile, _format):
    query_seq = record.seq #(change 3)
    count = 0
    output_file.write("{}\t".format(record.description))
    if query_seq in sequences_dict: #(change 4)
        count = len(sequences_dict[query_seq])
        output_file.write('\t'.join(sequences_dict[query_seq]) + "\n")
    processed_counter += 1
    print("processed protein", processed_counter)
    output_file_2.write(record.description+'\t'+str(count)+
                        '\t'+str(len(record.seq))+'\t'+str(record.seq)+'\n')
output_file.close()
output_file_2.close()
print("Done in --- {:.3f} seconds ---".format(time.time() - start_time))
```

更改 #1：将格式变量的名称改为 _format（以避免与 Python 中的 format 名称发生冲突），并在所有使用该名称的代码处同步修改。

更改 #2：使用 record.seq 作为字典的键，并将 record.description 追加到列表中（作为值）。

更改 #3：无需将 record.seq 强制转换为 str——它已经是一个字符串。

更改 #4：这 3 行代码定位匹配记录的速度，会比原始代码中遍历字典快得多。

我不确定 output_file.write("{}\t".format(record.description)) 这一行应该如何处理。另外，我不能保证已经找出了让程序完整运行所需的全部更改。如果您在尝试上述建议的更改后有任何疑问，请告诉我。

使用 dict.items() 为大型数据集优化字典查找

2回答