阅读背景:

从分析结果中根据list提取突变信息_sinat_40038704的博客

来源:互联网 
# _*_coding:utf-8_*_
# author: 稻田工作者
# date: 2020-06-13

"""根据原始样本对应的突变信息从数据分析文件中提取检出结果,如:
原始样本LC-BR3对应的突变信息如下:
NM_000245.2:exon14_intron14:c.3028_3028+16del17:p.?
NM_005228.3:exon20:c.2290_2291ins12:p.A763_Y764insFQEA
NM_000245.2:intron13:c.2888-41_2888-2delTAGCCGTCTTTAACAAGCTCTTTCTTTCTCTCTGTTTTAA:p.?
即:需要从分析文件中提取该突变信息对应的sample_name,Depth,frequency,CDS_change,Var_ss对应的文件内index(0,9,10,14,28)
"""

import xlrd
import csv
import pandas as pd

# 根据样本对应突变list生成样本和突变的字典
def generate_sample_dict(work_path,mutation_dict):
    list_file = open(r"%s\list.txt"%work_path,"r")
    for line in list_file:
        s_key = line.strip().split("\t")[0]
        s_value = line.strip().split("\t")[1]
        if s_value is not '':
            if s_key in mutation_dict:
                mutation_dict.get(s_key).append(s_value)
            else:
                mutation_dict.setdefault(s_key,[]).append(s_value)
    return mutation_dict

def extract_info(work_path,data_xlxs_name,mutation_dict):
    data_xlxs = xlrd.open_workbook(r"%s\%s.xlsx"%(work_path,data_xlxs_name),"r")
    data_sheet = data_xlxs.sheet_by_name("SNVIndelHotSpot")
    n_rows = data_sheet.nrows
    # 遍历源数据文件和样本list信息提取数据
    with open(r"%s\%s_filter_data.csv"%(work_path,data_xlxs_name),"w",newline="")as outfile:
        header = ["#sample_name","Depth","frequency","CDS_change","Var_ss"]
        write_csv = csv.DictWriter(outfile,fieldnames=header)
        write_csv.writeheader()
        for n in range(1,n_rows):
            if data_sheet.cell(n,14).value in mutation_dict and mutation_dict.values(data_sheet.cell(n,14).value) in data_sheet.cell(n,0).value:
                for k,v in mutation_dict.items():
                    if k in data_sheet.cell(n,0).value and data_sheet.cell(n,14).value in v:
                        write_csv.writerow({"#sample_name":data_sheet.cell(n,0).value,
                                            "Depth":data_sheet.cell(n,9).value,
                                            "frequency":data_sheet.cell(n,10).value,
                                            "CDS_change":data_sheet.cell(n,14).value,
                                            "Var_ss":data_sheet.cell(n,28).value})
def csv2xls(work_path,data_xlxs_name):
    csv_file = pd.read_csv(r"%s\%s_filter_data.csv"%(work_path,data_xlxs_name),encoding="utf-8",index_col=0,engine='python')
    csv_file.to_excel(r"%s\%s_filter_data.xlsx"%(work_path,data_xlxs_name),sheet_name="filter_data")
    if __name__ == '__main__':
        mutation_dict = {}
        work_path = input("输入list及源分析文件路径: ")
        generate_sample_dict(work_path,mutation_dict)
        data_xlxs_name = input("输入需要提取数据的源文件名称:")
        extract_info(work_path,data_xlxs_name,mutation_dict)
        csv2xls(work_path, data_xlxs_name)# _*_coding:utf-8_*_
# author: 稻田工作者
# date: 20



你的当前访问异常,请进行认证后继续阅读剩余内容。

分享到: