merged p4 and p6
author Richard <richard.neher@tuebingen.mpg.de>
Thu, 26 Sep 2013 16:25:43 +0000 (18:25 +0200)
committer Richard <richard.neher@tuebingen.mpg.de>
Thu, 26 Sep 2013 16:25:43 +0000 (18:25 +0200)
src/lib_tools.py
src/p4_consensus.py
src/p5_gather_aligned_reads.py [deleted file]
src/p6_align_global_per_barcode.py
src/p7_decontaminate_all.py [new file with mode: 0644]
src/p7_decontamination.py
src/p8_correct_all.py [new file with mode: 0644]
src/p8_detect_mutants_indels.py

index 856e665..44a4bc7 100644 (file)
@@ -53,7 +53,7 @@ def parse_read_label(read_label):
     entries = read_label.split('_')
     if len(entries)==3:
         return entries[0], int(entries[1]), int(entries[2]), entries[0]
-    elif len(entries)==4:
+    elif len(entries)>3:
         return entries[0], int(entries[1]), int(entries[2]), entries[3]
     else:
         print "parse_read_label: invalid read label:", read_label
@@ -73,7 +73,7 @@ def parse_readfile(fname):
                 dict_all_reads[pID].append([nb_reads_fwd,nb_reads_rev,seq])
                 count_reads_added_to_dict_all_reads += (nb_reads_fwd+nb_reads_rev)
             else:
-                #print 'Invalid pID: ' + pID
+                print 'Invalid pID: ' + pID
                 count_reads_with_invalid_pID += (nb_reads_fwd+nb_reads_rev)
     return dict_all_reads, count_reads_added_to_dict_all_reads, count_reads_with_invalid_pID
 
@@ -112,9 +112,9 @@ def check_neighbor_plausibility(seq1, seq2, distance_cutoff, verbose = False):
     score = align.localms(seq1, seq2, 1, 0, -1, -1)[0]
     if verbose:
         print 'Alignment score: '+ str(score[2])
-        print score[0]
-        print score[1]
-    return (score[2] >= len(seq1) - DIST_MAX)
+        print len(seq1), score[0]
+        print len(seq2), score[1]
+    return (score[2] >= len(seq1) - distance_cutoff)
 
 def remove_gaps(seq):
     return seq.replace('-','')
index d5668ab..afc6da6 100755 (executable)
@@ -71,6 +71,7 @@ if __name__=='__main__':
 
             with open(consensus_fname, 'w') as consensus_file, \
                     open(aligned_reads_fname, 'w') as aligned_reads_file:
+                # read all the temp files
                 for temp_dir in temp_directories:
                     pID_files = glob.glob(temp_dir+'/*aligned.fasta')
                     for pID_file in pID_files:
@@ -78,12 +79,35 @@ if __name__=='__main__':
                         with open(pID_file, 'r') as infile:
                             tmp_aln = AlignIO.read(infile, 'fasta')
                         AlignIO.write(tmp_aln, aligned_reads_file, 'fasta')
+                        #make consensus and write to file
                         consensus_seq = make_consensus(pID_file)
                         if consensus_seq[1]+consensus_seq[2]>2:
                             consensus_file.write('>'+lt.read_label(pID, consensus_seq[1], consensus_seq[2])+'\n')
                             consensus_file.write(consensus_seq[0]+'\n')
                                 
                     shutil.rmtree(temp_dir)
+
+            # align the consensus sequences
+            time_start = time.time()
+            aligned_fname = lt.trim_extension(consensus_fname)+'_aligned.fasta'
+            try:
+                cline = MuscleCommandline(input = fname, out = aligned_fname)
+                cline()
+            except:
+                print "Trouble aligning", fname
+            
+            # read all aligned sequences back in, sort them and write to file again
+            consensus_seqs, counts_good, counts_bad = lt.parse_readfile(aligned_fname)
+            with open(aligned_fname, 'w') as outfile:
+                sorted_pIDs = sorted(consensus_seqs.keys())
+                for pID in sorted_pIDs:
+                    for fwd, rev, seq in consensus_seqs[pID]:
+                        outfile.write('>'+lt.read_label(pID, fwd, rev)+'\n')
+                        outfile.write(seq+'\n')
+
+            time_end = time.time()
+            print 'alignment computation time: ' + str(time_end - time_start)
+
     else:
         print auto_file_name+': usage: '+auto_file_name+' <run directory> <read type to work on>'
 
diff --git a/src/p5_gather_aligned_reads.py b/src/p5_gather_aligned_reads.py
deleted file mode 100755 (executable)
index 657de1f..0000000
+++ /dev/null
@@ -1,92 +0,0 @@
-#!/usr/bin/python
-
-
-#
-# script that gathers all the aligned reads for a given barcode in one file
-# input: aligned files directory (in ../templates/)
-# output: gathered aligned reads files in a specific 'all-align-in-one-file' directory (in ../templates/)
-
-from Bio import AlignIO
-from Bio import SeqIO
-from Bio.Alphabet import generic_dna
-from Bio.Align import MultipleSeqAlignment
-import sys
-<<<<<<< HEAD
-import time
-import datetime
-import glob
-=======
->>>>>>> fabae8322c3b401072c40dadc2418216e17800d5
-import lib_tools as lt
-import argparse
-
-### parse command line arguments
-parser = argparse.ArgumentParser(description='Gather individual alignment files into one big file')
-parser.add_argument('--indir', required=True, type=str, help = 'directory with individual alignments')
-parser.add_argument('--outfile', default=None, type=str, help = 'file to store the aligned reads')
-args = parser.parse_args()
-auto_file_name = parser.prog
-
-<<<<<<< HEAD
-if __name__=='__main__':
-    if (len(sys.argv)==2):
-        # parse the input aligned files directory name
-        relative_path_to_align_dir=str(sys.argv[1])
-        if(relative_path_to_align_dir[-1]!='/'):
-            relative_path_to_align_dir+='/'
-        path_to_templates = "../templates/"
-        align_dir_basename = relative_path_to_align_dir.split('/')[2]
-        [prefix_date_and_id,file_type,barcode] = [align_dir_basename.split('dir-')[1].split('_')[i] for i in [0,1,2]]
-
-        # create (if necessary) the all-align-in-one-file directory
-        lt.check_and_create_directory(str(path_to_templates+'dir-'+prefix_date_and_id+'_all-align-in-one-file'))
-    
-        # get the list of files to gather 
-        list_files_to_gather = glob.glob(relative_path_to_align_dir+'*')
-    
-        global_list_alignments = []
-    
-        for i in range(len(list_files_to_gather)):
-            list_seq_cur_file = []
-            input_file_name = list_files_to_gather[i]
-            for seq_record in SeqIO.parse(input_file_name, 'fasta'):
-                list_seq_cur_file.append(seq_record)
-            cur_alignment = MultipleSeqAlignment(list_seq_cur_file)
-            global_list_alignments.append(cur_alignment)
-        AlignIO.write(global_list_alignments, str(path_to_templates+'dir-'+prefix_date_and_id+'_all-align-in-one-file/'+prefix_date_and_id+'_all-align-in-one-file_'+barcode+'.fasta'), 'fasta')
-    else:
-        print auto_file_name+': usage: '+auto_file_name+' <aligned reads files directory (in ../templates/)>'
-=======
-
-# parse the input aligned files directory name
-relative_path_to_align_dir = args.indir.rstrip('/')+'/'
-
-if args.outfile is None:
-    path_to_templates = "../templates/"
-    align_dir_basename = relative_path_to_align_dir.split('/')[-1]
-    [prefix_date_and_id,file_type,barcode] = [align_dir_basename.split('dir-')[1].split('_')[i] for i in [0,1,2]]
-    outdir = path_to_templates+'dir-'+prefix_date_and_id+'_all-align-in-one-file'
-    # create (if necessary) the all-align-in-one-file directory
-    lt.check_and_create_directory(outdir)
-    outfname = outdir+'/'+prefix_date_and_id+'_all-align-in-one-file_'+barcode+'.fasta'
-else:
-    outfname = args.outfile
-
-# get the list of files to gather 
-list_files_to_gather = glob.glob(relative_path_to_align_dir+'*.fa*')
-
-global_list_alignments = []    
-for fname in list_files_to_gather:
-    with open(fname, 'r') as pid_alignment:
-        pid = '.'.join(fname.split('.')[:-1]).split('_')[-1]
-        global_list_alignments.append((pid, AlignIO.read(pid_alignment, 'fasta')))
-
-
-global_list_alignments.sort(key = lambda x:x[0])
-with open(outfname, 'w') as outfile: 
-    for pid, aln in global_list_alignments:
-        AlignIO.write(aln, outfname, 'fasta')
-
-
-
->>>>>>> fabae8322c3b401072c40dadc2418216e17800d5
index c25d953..8a47201 100755 (executable)
@@ -39,6 +39,15 @@ if(len(sys.argv)==3):
                 cline()
             except:
                 print "Trouble aligning", fname
+            consensus_seqs, counts_good, counts_bad = lt.parse_readfile(aligned_fname)
+            with open(aligned_fname, 'w') as outfile:
+                sorted_pIDs = sorted(consensus_seqs.keys())
+                for pID in sorted_pIDs:
+                    for fwd, rev, seq in consensus_seqs[pID]:
+                        outfile.write('>'+lt.read_label(pID, fwd, rev)+'\n')
+                        outfile.write(seq+'\n')
+                                      
+
         time_end = time.time()
         print 'alignment computation time: ' + str(time_end - time_start)
 
diff --git a/src/p7_decontaminate_all.py b/src/p7_decontaminate_all.py
new file mode 100644 (file)
index 0000000..b959b07
--- /dev/null
@@ -0,0 +1,29 @@
+#!/usr/bin/python
+
+import os
+import sys
+import time
+import glob 
+
+####
+# retrieve the temp reads file names to align in the specific cluster directory (in ../templates/) given in input
+# the given cluster directory path must be relative to the current directory (src)
+####
+if(len(sys.argv)==2):
+    run_dir = str(sys.argv[1])
+    run_dir = run_dir.rstrip('/')+'/'
+
+    run  = run_dir[-2]
+    barcode_dir_list = glob.glob(run_dir+'bc_*_analysis')
+    for analysis_dir in barcode_dir_list:
+        barcode = analysis_dir.split('_')[-2]
+        #collect the list of temp reads files to align
+        cmd = 'qsub -cwd -l h_rt=00:59:00 -l h_vmem=10G ./src/p7_decontamination.py '+analysis_dir+' ./data/ref_seq_for_decontamination.fasta filtered '+'"'+run+' '+barcode+'"'
+        print cmd
+        os.system(cmd)
+
+else:
+    print sys.argv[1]+': usage: '+auto_file_name+' <directory of a run>'
+
+
+
index 5d4839d..1d123c6 100755 (executable)
@@ -16,6 +16,7 @@ from collections import defaultdict
 import sys
 import datetime
 import time
+sys.path.append('./src/')
 import lib_tools as lt
 
 
@@ -133,60 +134,40 @@ if (len(sys.argv)==5):
     dict_count_pIDs_to_bad_aligned_file=defaultdict(int)
     dict_count_pIDs_from_contamination=defaultdict(int)
     dict_count_pIDs_to_decontaminated_file=defaultdict(int)
-    list_of_reads_to_write_in_bad_aligned_file=[]
-    list_of_reads_to_write_in_decontaminated_file=[]
-    list_of_reads_from_contamination=[]
-    count_reads_no_good_alignments = 0
-    count_reads_good_alignments = 0
-
-    with open(barcode_dir+'aligned_reads_'+readtype+'.fasta','r') as aligned_reads_file:
-        for record in SeqIO.parse(aligned_reads_file,'fasta'):
-            pID,nb_reads_fwd,nb_reads_rev,pID_orig = record.id.split('_')[0:4]
-            nb_reads_fwd = int(nb_reads_fwd)
-            nb_reads_rev = int(nb_reads_rev)
-            read = record.seq
-            if pID in dict_reclassif.keys(): # bad alignment with the associated ref. seq.
-                if len(dict_reclassif[pID])>0: # contamination from an other run/barcode
-                    list_of_reads_from_contamination.append(record)
-                    dict_count_pIDs_from_contamination[pID]+=(nb_reads_fwd+nb_reads_rev)
-                else: # bad alignment with all the ref. seq.
-                    list_of_reads_to_write_in_bad_aligned_file.append(record)
-                    dict_count_pIDs_to_bad_aligned_file[pID]+=(nb_reads_fwd+nb_reads_rev)
-                    count_reads_no_good_alignments += (nb_reads_fwd+nb_reads_rev)
-            else: # good alignment with the associated ref. seq.
-                list_of_reads_to_write_in_decontaminated_file.append(record)
-                dict_count_pIDs_to_decontaminated_file[pID]+= (nb_reads_fwd+nb_reads_rev)
-                count_reads_good_alignments += (nb_reads_fwd+nb_reads_rev)
-    
-        
-   # print 'RUN-barcode '+true_seq_id+': #pIDs with better alignments from other barcodes: '+str(count_pIDs_reclassif)
-   # print '-----> #pIDs with no good alignments: '+ str(count_pIDs_no_good_alignments)
-    print '-----> #reads with no good alignments: '+ str(count_reads_no_good_alignments)
-    print '-----> #reads with good alignments: '+ str(count_reads_good_alignments)
-    print '##########'
-
-    # write the decontaminated aligned filtered reads file
-    with open(barcode_dir+'aligned_reads_'+readtype+'_decontaminated.fasta','w') as outfile:
-        print 'write decontaminated file for run-barcode: '+str(true_seq_id)
-        for record in list_of_reads_to_write_in_decontaminated_file:
-            outfile.write('>'+str(record.id)+'\n')
-            outfile.write(str(record.seq)+'\n')
-        print '-- #reads written: '+str(len(list_of_reads_to_write_in_decontaminated_file))
-                
-    # write the bad aligned filtered reads file
-    with open(barcode_dir+'aligned_reads_'+readtype+'_bad_aligned.fasta','w') as outfile:
-        print 'write bad aligned reads file for run-barcode: '+str(true_seq_id)
-        for record in list_of_reads_to_write_in_bad_aligned_file:
-            outfile.write('>'+str(record.id)+'\n')
-            outfile.write(str(record.seq)+'\n')
-        print '-- #reads written: '+str(len(list_of_reads_to_write_in_bad_aligned_file))
+
+    with open(barcode_dir+'aligned_reads_'+readtype+'_decontaminated.fasta','w') as outfile_good, \
+            open(barcode_dir+'aligned_reads_'+readtype+'_bad_aligned.fasta','w') as outfile_bad:
+        sorted_pIDs = sorted(dict_all_reads.keys())
+        for pID in sorted_pIDs:
+            if pID in dict_reclassif:
+                read_count=0
+                for read in dict_all_reads[pID]:
+                    outfile_bad.write('>'+lt.read_label(pID, read[0], read[1])+'\n')
+                    outfile_bad.write(read[2]+'\n')
+                    read_count+=read[0] + read[1]
+                if len(dict_reclassif[pID])>0:
+                    dict_count_pIDs_from_contamination[pID] = read_count
+                else:
+                    dict_count_pIDs_to_bad_aligned_file[pID] = read_count
+            else:
+                read_count=0
+                for read in dict_all_reads[pID]:
+                    outfile_good.write('>'+lt.read_label(pID, read[0], read[1])+'\n')
+                    outfile_good.write(read[2]+'\n')                
+                    read_count+=read[0] + read[1]
+                dict_count_pIDs_to_decontaminated_file[pID]+=read_count
+         print '-- # good reads written: '+str(sum(dict_count_pIDs_to_decontaminated_file.values()))
+         print '-- # contaminant reads written: '+str(sum(dict_count_pIDs_from_contamination.values()))
+         print '-- # reads of unknown origin written: '+str(sum(dict_count_pIDs_to_bad_aligned_file.values()))
 
     # write the consensus sequences file
     with open(barcode_dir+'consensus_sequences_'+readtype+'_decontaminated.fasta','w') as outfile:
         print 'write decontaminated file for run-barcode: '+str(true_seq_id)
-        for pID,rec in dict_cons_seq.iteritems():
+        consensus_pIDs = sorted(dict_cons_seq.keys())
+        for pID in consensus_pIDs:
             if pID not in dict_reclassif:
-                outfile.write('>'+lt.read_label(pID, rec[0][0], rec[0][1])+'\n')
+                rec = dict_cons_seq[pID][0]
+                outfile.write('>'+lt.read_label(pID, rec[0], rec[1])+'\n')
                 outfile.write(rec[0][2]+'\n')
 
 
diff --git a/src/p8_correct_all.py b/src/p8_correct_all.py
new file mode 100644 (file)
index 0000000..179c6b6
--- /dev/null
@@ -0,0 +1,28 @@
+#!/usr/bin/python
+
+import os
+import sys
+import time
+import glob 
+
+####
+# retrieve the temp reads file names to align in the specific cluster directory (in ../templates/) given in input
+# the given cluster directory path must be relative to the current directory (src)
+####
+if(len(sys.argv)==3):
+    run_dir = str(sys.argv[1])
+    run_dir = run_dir.rstrip('/')+'/'
+    read_type = sys.argv[2]
+
+    barcode_dir_list = glob.glob(run_dir+'bc_*_analysis')
+    for analysis_dir in barcode_dir_list:
+        #collect the list of temp reads files to align
+        cmd = 'qsub -cwd -l h_rt=00:59:00 -l h_vmem=10G ./src/p8_detect_mutants_indels.py '+analysis_dir+ ' ' +read_type
+        print cmd
+        os.system(cmd)
+
+else:
+    print sys.argv[1]+': usage: '+sys.argv[0]+' <directory of a run> <read_type>'
+
+
+
index e262b27..c7346b0 100755 (executable)
@@ -16,9 +16,11 @@ from Bio.Seq import Seq
 from Bio.Alphabet import generic_dna
 from collections import Counter
 from collections import defaultdict
+from Bio.pairwise2 import align
 from itertools import imap
 import operator
 import sys
+sys.path.append('./src')
 import datetime
 import time
 import lib_tools as lt
@@ -109,8 +111,8 @@ if (len(sys.argv) == 3):
     print 'nb occ pID_a at pos lim_pID_a in list_occ_pIDs_dec: '+str(list_occ_pIDs_dec[lim_pID_a][1])
 
     for ii_a, pID_a in enumerate(list_occ_pIDs_dec[:lim_pID_a]):
-        # this condition should always evaluate to true
-        if dict_neighbor_state_pIDs[pID_a[0]] == False:
+        # this condition should always evaluate to false
+        if dict_neighbor_state_pIDs[pID_a[0]] == True:
             continue
         # loop over rare pid
         for ii,pID_r in enumerate(list_occ_pIDs_inc[:-(ii_a+1)]):
@@ -142,9 +144,8 @@ if (len(sys.argv) == 3):
                             
                         #if(pID_)
 
-
                         for nf, nr, seq_r in neighbor_candidates:
-                            if lt.check_neighbor_plausibility(seq_a, seq_r, DIST_MAX):
+                            if lt.check_neighbor_plausibility(seq_a, seq_r, DIST_MAX, verbose=False):
                                 dict_neighbors[pID_a[0]].append(pID_r[0])
                                 dict_neighbor_of[pID_r[0]] = pID_a[0]
                                 dict_neighbor_state_pIDs[pID_r[0]] = True