continued stream lining
authorRichard <richard.neher@tuebingen.mpg.de>
Thu, 19 Sep 2013 12:58:16 +0000 (14:58 +0200)
committerRichard <richard.neher@tuebingen.mpg.de>
Thu, 19 Sep 2013 12:58:16 +0000 (14:58 +0200)
src/lib_tools.py
src/p2_sort.py
src/p3_cluster_align.py
src/p4_consensus.py
src/p6_align_global_per_barcode.py
src/p7_detect_mutants_indels.py

index 3806055..a07f1c8 100644 (file)
@@ -1,4 +1,5 @@
 from Bio.pairwise2 import align
+from Bio import SeqIO
 from collections import defaultdict
 import os
 
@@ -64,7 +65,7 @@ def parse_readfile(fname):
     with open(fname, 'r') as reads_file:
         # for all the valid reads (without Ns), add the read to dict_all_reads
         for record in SeqIO.parse(reads_file, 'fasta'):
-            pID,nb_reads_fwd,nb_reads_rev = record.id.split('_')[0:2]
+            pID,nb_reads_fwd,nb_reads_rev = record.id.split('_')[0:3]
             nb_reads_fwd = int(nb_reads_fwd)
             nb_reads_rev = int(nb_reads_rev)
             seq = str(record.seq)
index fb3a3ef..16f7ac2 100755 (executable)
@@ -10,7 +10,7 @@
 
 import numpy as np
 import matplotlib.pyplot as plt
-import os
+import glob
 import sys
 import datetime
 import time
@@ -49,7 +49,7 @@ if (len(sys.argv) ==3):
 
         # create directory for batch 0
         batch=0
-        temp_pid_files_dir =analysis_dir+'/temp_'+readtype+'_'+"{0:04d}".format(batch)
+        temp_pid_files_dir =bc_dir+'/temp_'+readtype+'_'+"{0:04d}".format(batch)
         lt.check_and_create_directory(temp_pid_files_dir)
         dict_pIDs = defaultdict(list)
     
@@ -70,10 +70,10 @@ if (len(sys.argv) ==3):
                             output_pID_file.write(str(read[1]+'\n'))
                             count+=1
                             if(count%500==0):
-                        print 'count = ' + str(count)
+                                print 'count = ' + str(count)
                         if ((batch+1)*batchsize<pii):
                             batch+=1
-                            temp_pid_files_dir =analysis_dir+'/temp_'+readtype+'_'+"{0:04d}".format(batch)
+                            temp_pid_files_dir =bc_dir+'/temp_'+readtype+'_'+"{0:04d}".format(batch)
                             lt.check_and_create_directory(temp_pid_files_dir)
                             
                             print 'total : ' + str(count)
index ec3e4f7..dce60a4 100755 (executable)
@@ -13,19 +13,21 @@ import glob
 # the given cluster directory path must be relative to the current directory (src)
 ####
 if(len(sys.argv)==2):
-    analysis_dir = str(sys.argv[1])
-    analysis_dir = analysis_dir.rstrip('/')+'/'
+    run_dir = str(sys.argv[1])
+    run_dir = run_dir.rstrip('/')+'/'
 
-    #collect the list of temp reads files to align
-    list_temp_dirs = glob.glob(analysis_dir+'temp_*')
-    #run one job per file to align on the cluster
-    for temp_dir in list_temp_dirs:
-        cmd = 'qsub -cwd -l h_rt=12:00:00 -l h_vmem=10G ./src/p3_align.py '+temp_dir
-        print cmd
-        os.system(cmd)
+    barcode_dir_list = glob.glob(run_dir+'bc_*_analysis')
+    for analysis_dir in barcode_dir_list:
+        #collect the list of temp reads files to align
+        list_temp_dirs = glob.glob(analysis_dir+'/temp_*')
+        #run one job per file to align on the cluster
+        for temp_dir in list_temp_dirs:
+            cmd = 'qsub -cwd -l h_rt=12:00:00 -l h_vmem=10G ./src/p3_align.py '+temp_dir
+            print cmd
+            os.system(cmd)
 
 else:
-    print auto_file_name+': usage: '+auto_file_name+' <specific cluster directory relative path (in ../templates/)>'
+    print auto_file_name+': usage: '+auto_file_name+' <directory of a run>'
 
 
 
index f2ef9ca..948ae53 100755 (executable)
@@ -57,30 +57,34 @@ def make_consensus(file_name):
 if __name__=='__main__':
     if(len(sys.argv)==3):
         # parse the input consensus sequences file name
-        analysis_dir=str(sys.argv[1]).rstrip('/')+'/'
-        readtype = '_'+sys.argv[2]
+        run_dir=str(sys.argv[1]).rstrip('/')+'/'
+        readtype = sys.argv[2]
         
-        temp_directories = glob.glob(analysis_dir+'temp_'+readtype+'*')
-        consensus_fname = analysis_dir+'consensus_sequences'+readtype+'.fasta'
-        aligned_reads_fname = analysis_dir+'aligned_reads'+readtype+'.fasta'
+        barcode_dir_list = glob.glob(run_dir+'bc_*_analysis')
 
-        with open(consensus_fname, 'w') as consensus_file, \
-                open(aligned_reads_fname, 'w') as aligned_reads_file:
-            print temp_directories
-            for temp_dir in temp_directories:
-                pID_files = glob.glob(temp_dir+'/*aligned.fasta')
-                print pID_files
-                for pID_file in pID_files:
-                    pID = lt.get_last_part_of_path(pID_file).split('_')[0]
-                    with open(pID_file, 'r') as infile:
-                        tmp_aln = AlignIO.read(infile, 'fasta')
-                    AlignIO.write(tmp_aln, aligned_reads_file, 'fasta')
-                    consensus_seq = make_consensus(pID_file)
-                    if consensus_seq[1]+consensus_seq[2]>2:
-                        consensus_file.write('>'+lt.read_label(pID, consensus_seq[1], consensus_seq[2])+'\n')
-                        consensus_file.write(consensus_seq[0]+'\n')
+        for analysis_dir in barcode_dir_list:
+            print "analyzing directory:", analysis_dir
+            temp_directories = glob.glob(analysis_dir+'/temp_'+readtype+'*')
+            consensus_fname = analysis_dir+'/consensus_sequences_'+readtype+'.fasta'
+            aligned_reads_fname = analysis_dir+'/aligned_reads_'+readtype+'.fasta'
 
-                shutil.rmtree(temp_dir)
+            with open(consensus_fname, 'w') as consensus_file, \
+                    open(aligned_reads_fname, 'w') as aligned_reads_file:
+                print temp_directories
+                for temp_dir in temp_directories:
+                    pID_files = glob.glob(temp_dir+'/*aligned.fasta')
+                    print pID_files
+                    for pID_file in pID_files:
+                        pID = lt.get_last_part_of_path(pID_file).split('_')[0]
+                        with open(pID_file, 'r') as infile:
+                            tmp_aln = AlignIO.read(infile, 'fasta')
+                        AlignIO.write(tmp_aln, aligned_reads_file, 'fasta')
+                        consensus_seq = make_consensus(pID_file)
+                        if consensus_seq[1]+consensus_seq[2]>2:
+                            consensus_file.write('>'+lt.read_label(pID, consensus_seq[1], consensus_seq[2])+'\n')
+                            consensus_file.write(consensus_seq[0]+'\n')
+
+                    shutil.rmtree(temp_dir)
     else:
-        print auto_file_name+': usage: '+auto_file_name+' <aligned reads files directory (in ../templates/)>'
+        print auto_file_name+': usage: '+auto_file_name+' <run directory> <read type to work on>'
 
index 2fe6577..c25d953 100755 (executable)
@@ -9,7 +9,7 @@
 import numpy as np
 from Bio import SeqIO
 from Bio.Align.Applications import MuscleCommandline
-import os
+import glob
 import sys
 import time
 import lib_tools as lt
@@ -22,24 +22,25 @@ auto_file_name = str(sys.argv[0])
 ######
 
 
-if(len(sys.argv)>1):
+if(len(sys.argv)==3):
     # parse the input consensus sequences file name
     rundir=str(sys.argv[1]).rstrip('/')+'/'
-    if len(sys.argv)>2:
-        analysis_type = '_'+sys.argv[2]
+    read_type = '_'+sys.argv[2]
 
     barcode_directories = glob.glob(rundir+'bc_*_analysis*')
     for dname in barcode_directories:
-        consensus_file_list = glob.glob(dname+'/consensus*fasta')
+        consensus_file_list = glob.glob(dname+'/consensus*'+read_type+'.fasta')
         time_start = time.time()
         for fname in consensus_file_list:
             aligned_fname = lt.trim_extension(fname)+'_aligned.fasta'
 
-            cline = MuscleCommandline(input = fname, out = aligned_fname)
-            cline()
-
+            try:
+                cline = MuscleCommandline(input = fname, out = aligned_fname)
+                cline()
+            except:
+                print "Trouble aligning", fname
         time_end = time.time()
         print 'alignment computation time: ' + str(time_end - time_start)
 
 else:
-    print auto_file_name + ': usage: '+ auto_file_name + ' <directory containing the directories for each barcode containing the consensus sequence files'
+    print auto_file_name + ': usage: '+ auto_file_name + ' <run director> <read type>'
index 9cd7801..1da61c0 100755 (executable)
@@ -69,9 +69,9 @@ if (len(sys.argv) == 3):
     for pID,reads in dict_all_reads.iteritems():
         pID_abundance[pID] = sum([x[0]+x[1] for x in reads])
 
-    print '#pIDs occurring only once: ' + str([x==1 for x in pID_abundance.values()])
-    print '#pIDs occurring only twice: ' + str([x==2 for x in pID_abundance.values()])
-    print '#pIDs occurring at least 3 times: ' + str([x>2 for x in pID_abundance.values()])
+    print '#pIDs occurring only once: ' + str(sum([x==1 for x in pID_abundance.values()]))
+    print '#pIDs occurring only twice: ' + str(sum([x==2 for x in pID_abundance.values()]))
+    print '#pIDs occurring at least 3 times: ' + str(sum([x>2 for x in pID_abundance.values()]))
     ####
 
 
@@ -200,4 +200,4 @@ if (len(sys.argv) == 3):
 
 
 else:
-    print auto_file_name + ': usage: '+ auto_file_name + ' <directory with reads from a barcode>'
+    print auto_file_name + ': usage: '+ auto_file_name + ' <directory with reads from a barcode> <type of read>'