bs4(也就是Beautiful Soup)是Python中最受欢迎的网页解析模块,的确是一个非常好用的工具。但是我在使用中发现了一个坑,bs4里不同的xml/html解析器会得到不同的结果。不过对于这一点,官网文档里也有说明,并详细比较了不同解析器的结果差异。我一开始使用的lxml是最高效、用的人也最多的解析器,但是它对我的html页面的解析出现了偏差,当我切换到Python自带的html.parser之后,一切太平。
def make_reg_dic(gene_keggmapper_file):
    """Group genes from a gene/colour file by regulation direction.

    Every non-blank line is expected to hold a gene identifier and a colour
    name separated by a single tab. Genes whose colour is ``blue`` are
    collected under ``"down"``; genes with any other colour are collected
    under ``"up"``.

    Args:
        gene_keggmapper_file: Path of the tab-separated gene/colour file.

    Returns:
        A dict ``{"down": [...], "up": [...]}`` listing genes in file order.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
    """
    gene_regulation_dic = {"down": [], "up": []}
    with open(gene_keggmapper_file, 'r') as gene_reg:
        for line in gene_reg:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no record.
                continue
            gene, color = line.split("\t")
            direction = 'down' if color == "blue" else 'up'
            gene_regulation_dic[direction].append(gene)
    return gene_regulation_dic


def kegg_html_parse(in_html, in_keggmapper_file, out_gene_list_file, out_count_file):
    """Write per-pathway lists and counts of up/down-regulated genes.

    The HTML page is parsed with Beautiful Soup and every ``<li>`` element
    under ``<body>`` is treated as one pathway: the first text line yields the
    pathway name and each following non-blank line yields one gene. The genes
    are intersected with the "up" and "down" groups returned by
    ``make_reg_dic(in_keggmapper_file)``.

    Args:
        in_html: Path of the HTML page to parse.
        in_keggmapper_file: Path of the tab-separated gene/colour file.
        out_gene_list_file: Output path; one tab-separated row per pathway
            with the pathway name and the ``"; "``-joined down- and
            up-regulated genes.
        out_count_file: Output path; one tab-separated row per pathway with
            the pathway name and the number of down- and up-regulated genes.

    Returns:
        None. Results are written to the two output files.
    """
    from bs4 import BeautifulSoup

    gene_regulation_dic = make_reg_dic(in_keggmapper_file)
    # Build the lookup sets once instead of once per pathway.
    up_genes = set(gene_regulation_dic['up'])
    down_genes = set(gene_regulation_dic['down'])

    # NOTE(review): Beautiful Soup parsers can build different trees from the
    # same markup; if pathways or genes look wrong, compare with "html.parser".
    with open(in_html) as html_handle:
        soup = BeautifulSoup(html_handle, "lxml")

    with open(out_gene_list_file, 'w') as path_out, \
            open(out_count_file, 'w') as path_plot_out:
        path_out.write("\t".join(["pathways", "downregulation", "upregulation"])+'\n')
        # The count header has one field fewer than the data rows; kept as in
        # the original (presumably so the first column is read as row names --
        # TODO confirm with the downstream plotting script).
        path_plot_out.write("\t".join(["downregulation", "upregulation"])+'\n')

        for pathway_raw in soup.body.find_all('li'):
            lines = pathway_raw.text.strip().split("\n")
            # Fixed-offset slicing tied to the layout of the page text;
            # presumably trims an id prefix and a trailing counter/link
            # label -- verify against an actual page before changing.
            pathway_name = lines[0][9:-4].strip()[:-23]
            # For each gene line: take the text before the first ';', split
            # on whitespace and keep the second token. Whitespace-only lines
            # are skipped so they cannot raise IndexError.
            genes = {
                gene.strip().split(";")[0].split()[1].strip()
                for gene in lines[1:]
                if gene.strip()
            }
            # Sorted so the output does not depend on set iteration order.
            up_comman = sorted(genes & up_genes)
            down_comman = sorted(genes & down_genes)
            nup = str(len(up_comman))
            ndown = str(len(down_comman))
            path_plot_out.write("\t".join([pathway_name, ndown, nup])+'\n')
            col2 = "; ".join(down_comman)
            col3 = "; ".join(up_comman)
            path_out.write("\t".join([pathway_name, col2, col3])+"\n")