extended read me file
[pid.git] / src / p4_consensus.py
1 #!/usr/bin/python
2
3 #
4 # script that creates consensus sequences files for a given barcode (barcode corresponding to the name of the input aligned files directory (in ../templates/))
5 # input: aligned files directory (in ../templates/)
6 # output: consensus sequences files in specific directory (../templates/dir-<date_and_id>_consensus/)
7
8 import numpy as np
9 from Bio import AlignIO
10 from Bio import SeqIO
11 from Bio.SeqRecord import SeqRecord
12 from Bio.Align.Applications import MuscleCommandline
13 import os
14 import shutil
15 import sys
16 import glob
17 import time
18 import lib_tools as lt
19
20 auto_file_name = str(sys.argv[0])
21 verbose = 0
22 ######
23 # DEF FUNCTIONS
24 ######
25
26
27 ######
28 def make_consensus(file_name):
29 #print 'consensus for file: '+file_name
30 alignment = AlignIO.read(file_name,"fasta")
31 nb_reads_fwd = np.asarray([lt.parse_read_label(seq.id)[1] for seq in alignment], int)
32 #print nb_reads_fwd
33 nb_reads_rev = np.asarray([lt.parse_read_label(seq.id)[2] for seq in alignment], int)
34 #print nb_reads_rev
35 nb_reads = nb_reads_fwd + nb_reads_rev
36
37 alignment = np.asarray(alignment)
38 len_align,len_seq = alignment.shape
39 alpha = np.array(list('ATCG-N'))
40 nuc_counts = np.zeros((alpha.shape[0], len_seq))
41 if(np.sum(nb_reads) >= 3):
42 #compute the consensus sequence taking into account the numbers of fwd/rev reads corresponding to the pID
43 for ni,nuc in enumerate(alpha):
44 nuc_counts[ni] = np.sum((alignment == nuc)*np.repeat(np.array([nb_reads]).T, len_seq, axis=1), axis=0)
45 str_consensus = "".join(alpha[np.argmax(nuc_counts, axis=0)])
46
47 #remove gaps
48 #str_consensus_2 = "".join([nuc for nuc in str_consensus if nuc!='-'])
49 str_consensus_2 = lt.remove_gaps(str_consensus)
50
51 return (str_consensus_2, np.sum(nb_reads_fwd), np.sum(nb_reads_rev))
52 else:
53 if verbose: print 'file: '+file_name+': not enough reads (#reads < 3) to make consensus'
54 return ('nothing',0,0)
55 #######
56
57
58
59 if __name__=='__main__':
60 if(len(sys.argv)==3):
61 # parse the input consensus sequences file name
62 run_dir=str(sys.argv[1]).rstrip('/')+'/'
63 readtype = sys.argv[2]
64
65 barcode_dir_list = glob.glob(run_dir+'bc_*_analysis')
66
67 for analysis_dir in barcode_dir_list:
68 print "analyzing directory:", analysis_dir
69 temp_directories = glob.glob(analysis_dir+'/temp_'+readtype+'*')
70 consensus_fname = analysis_dir+'/consensus_sequences_'+readtype+'.fasta'
71 aligned_reads_fname = analysis_dir+'/aligned_reads_'+readtype+'.fasta'
72
73 with open(consensus_fname, 'w') as consensus_file, \
74 open(aligned_reads_fname, 'w') as aligned_reads_file:
75 # read all the temp files
76 for temp_dir in temp_directories:
77 pID_files = glob.glob(temp_dir+'/*aligned.fasta')
78 for pID_file in pID_files:
79 pID = lt.get_last_part_of_path(pID_file).split('_')[0]
80 with open(pID_file, 'r') as infile:
81 tmp_aln = AlignIO.read(infile, 'fasta')
82 AlignIO.write(tmp_aln, aligned_reads_file, 'fasta')
83 #make consensus and write to file
84 consensus_seq = make_consensus(pID_file)
85 if consensus_seq[1]+consensus_seq[2]>2:
86 consensus_file.write('>'+lt.read_label(pID, consensus_seq[1], consensus_seq[2])+'\n')
87 consensus_file.write(consensus_seq[0]+'\n')
88
89 shutil.rmtree(temp_dir)
90
91 # align the consensus sequences
92 time_start = time.time()
93 aligned_fname = lt.trim_extension(consensus_fname)+'_aligned.fasta'
94 try:
95 cline = MuscleCommandline(input = consensus_fname, out = aligned_fname)
96 cline()
97 except:
98 print "Trouble aligning", consensus_fname
99
100 # read all aligned sequences back in, sort them and write to file again
101 consensus_seqs, counts_good, counts_bad = lt.parse_readfile(aligned_fname)
102 with open(aligned_fname, 'w') as outfile:
103 sorted_pIDs = sorted(consensus_seqs.keys())
104 for pID in sorted_pIDs:
105 for fwd, rev, seq in consensus_seqs[pID]:
106 outfile.write('>'+lt.read_label(pID, fwd, rev)+'\n')
107 outfile.write(seq+'\n')
108
109 time_end = time.time()
110 print 'alignment computation time: ' + str(time_end - time_start)
111
112 else:
113 print auto_file_name+': usage: '+auto_file_name+' <run directory> <read type to work on>'
114