reorganized file names etc
authorRichard <richard.neher@tuebingen.mpg.de>
Wed, 18 Sep 2013 14:17:03 +0000 (16:17 +0200)
committerRichard <richard.neher@tuebingen.mpg.de>
Wed, 18 Sep 2013 14:17:03 +0000 (16:17 +0200)
14 files changed:
configFile.py [new file with mode: 0644]
src/configFile.py [deleted file]
src/lib_tools.py
src/p1_trim_and_filter.py
src/p2_bis_send_big_files_to_cluster.py
src/p2_sort.py
src/p2_sort_all.py [new file with mode: 0644]
src/p3_align.py
src/p3_cluster_align.py [new file with mode: 0755]
src/p3_cluster_align_aux.py [deleted file]
src/p3_cluster_align_main.py [deleted file]
src/p3_cluster_align_script.py [deleted file]
src/p4_consensus.py
src/p6_align_global_per_barcode.py

diff --git a/configFile.py b/configFile.py
new file mode 100644 (file)
index 0000000..9657bae
--- /dev/null
@@ -0,0 +1,33 @@
+###
+# Configuration parameters for: p1_trim_and_filter
+###
+####
+# 454
+###
+#cfg={
+#    'prefix_date_and_id':'2013-09-04-Run1',
+#    'p5_virus_match':'TGGCAGTCTAGCAGAAGAAG',
+#    'p3_virus_match':'CCTCAGGAGGGGACCCAG',
+#    'barcodes':['ACGT', 'TACG', 'CGTA'],
+#    'input_data_file':'subsample.fastq',
+#    'reverse':True,
+#    'min_occ_same_pid':1,
+#    'min_length_pid':10,
+#    'barcode_length_range':range(3,5)
+#}
+#
+###
+# iontorrent
+###
+cfg={
+    'runid':'2013-09-04-Run1',
+    'p5_virus_match':'TGGCAGTCTAGCAGAAGAAG',
+    'p3_virus_match':'CCTCAGGAGGGGACCCAG',
+    'barcodes':['TACG','ACGT','CGTA','GTAC'],
+    'input_data_file':'data/subsample_iontorrent.fastq',
+    'reverse':False,
+    'min_occ_same_pid':1,
+    'min_length_pid':8,
+    'barcode_length_range':range(3,5)
+}
+
diff --git a/src/configFile.py b/src/configFile.py
deleted file mode 100644 (file)
index 3328f8c..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-###
-# Configuration parameters for: p1_trim_and_filter
-###
-cfg={
-    'prefix_date_and_id':'2013-09-04-Run1',
-    'p5_virus_match':'TGGCAGTCTAGCAGAAGAAG',
-    'p3_virus_match':'CCTCAGGAGGGGACCCAG',
-    'barcodes':['TACG','ACGT','CGTA','GTAC'],
-    'input_data_file':'subsample_iontorrent.fastq',
-    'reverse':False,
-    'min_occ_same_pid':1,
-    'min_length_pid':8,
-    'barcode_length_range':range(3,5)
-}
-
index f983515..3806055 100644 (file)
@@ -33,6 +33,8 @@ def check_and_create_directory(dir_name):
 
 def get_last_part_of_path(fname):
     return fname.split('/')[-1]
+def get_base_of_path(fname):
+    return '/'.join(fname.split('/')[:-1])
 
 def trim_extension(fname):
     return '.'.join(fname.split('.')[:-1])
@@ -40,6 +42,20 @@ def trim_extension(fname):
 def get_extention(fname):
     return fname.split('.')[-1]
 
+def read_label(pID, nfwd, nrev, orig_pID=None):
+    if orig_pID:
+        return '_'.join(map(str, [pID, nfwd, nrev]))
+    else:
+        return '_'.join(map(str, [pID, nfwd, nrev, orig_pID]))
+
+def parse_read_label(read_label):
+    entries = read_label.split('_')
+    if len(entries)==3:
+        return entries[0], int(entries[1]), int(entries[2]), entries[0]
+    elif len(entries)==4:
+        return entries[0], int(entries[1]), int(entries[2]), entries[3]
+    else:
+        print "parse_read_label: invalid read label:", read_label
 
 def parse_readfile(fname):
     count_reads_added_to_dict_all_reads=0
@@ -60,6 +76,37 @@ def parse_readfile(fname):
                 count_reads_with_invalid_pID += (nb_reads_fwd+nb_reads_rev)
     return dict_all_reads, count_reads_added_to_dict_all_reads, count_reads_with_invalid_pID
 
+def make_file_name(name_parts):
+    fields = []
+    for f in ['bc', 'pid', 'min']:        
+        if f in name_parts:
+            fields.extend([f, str(name_parts[f])])
+
+    fields.extend(map(str, name_parts['other']))
+    return name_parts['runid']+'/'+'_'.join(fields)+'.'+name_parts['ext']
+
+def parse_file_name(fname):
+    entries = fname.split('/')[-1].split('_')
+    fields=[]
+    remainder=0
+    for f in ['bc', 'pid', 'min']:
+        if entries[remainder]==f: 
+            fields[f]=entries[remainder+1]
+            try:
+                fields[f]=int(fields[f])
+            except:
+                pass
+            remainder+=2
+
+    fields['other']=entries[remainder:]
+    return fields
+
+def make_dir_name(runid, barcode, dir_label):
+    return 'dir-'+'_'.join([runid, barcode, dir_label])
+
+def parse_file_name(dir_name):
+    return dir_name[4:].split('_')[:2]
+
 def check_neighbor_plausibility(seq1, seq2, distance_cutoff, verbose = False):
     score = align.localms(seq1, seq2, 1, 0, -1, -1)[0]
     if verbose:
index c12f850..aa06f02 100755 (executable)
@@ -34,12 +34,12 @@ class struct_var_set:
         #if(CONFIG_FILE_NAME in sys.argv):
         try:
             execfile(CONFIG_FILE_NAME)
-            if ('prefix_date_and_id' in cfg.keys() and 'p5_virus_match' in cfg.keys() 
+            if ('runid' in cfg.keys() and 'p5_virus_match' in cfg.keys() 
                 and 'p3_virus_match' in cfg.keys() and 'input_data_file' in cfg.keys() 
                 and 'barcodes' in cfg.keys() and 'reverse' in cfg.keys()):
 
                 # MANDATORY variables in the configuration file:
-                self.prefix_date_and_id = cfg['prefix_date_and_id']
+                self.runid = cfg['runid']
                 self.p5_virus_match = cfg['p5_virus_match']
                 self.p3_virus_match = cfg['p3_virus_match']
                 self.match_seqs = [self.p5_virus_match, self.p3_virus_match]
@@ -82,7 +82,7 @@ class struct_var_set:
                 #self.templates={}
             else:
                 print 'Problem: at least one of the following (mandatory) parameters is missing in ' + CONFIG_FILE_NAME + ':'
-                print 'prefix_date_and_id, p5_virus_match, p3_virus_match, p3_adaptor, barcodes, input_data_file, reverse.' 
+                print 'runid, p5_virus_match, p3_virus_match, p3_adaptor, barcodes, input_data_file, reverse.' 
         #else:
         except:
             print 'The configuration file ' + CONFIG_FILE_NAME + ' is missing or mistaken !'
@@ -139,7 +139,7 @@ def retrieve_pID_barcode_and_sequence(res, fwd, scores, seq, seq_id):
 #####
 def filter_reads(res):
     time_start = time.time()
-    with open(str('../data/'+res.input_data_file), 'r') as seq_file:
+    with open(str(res.input_data_file), 'r') as seq_file:
         file_format = res.input_data_file.split('.')[-1]
         for record in SeqIO.parse(seq_file, file_format):
             tmp_seq = str(record.seq)
@@ -193,7 +193,8 @@ def logfile_output(res, barcode):
     res is the structure containing the filtering operation results
     barcode
     '''
-    with open('../data/' + res.prefix_date_and_id + '_' + str(barcode) + '_SW_match_summary.txt', 'w') as log_file:
+    lt.check_and_create_directory(res.runid)
+    with  open(res.runid+'/bc_'+barcode+'_summary.txt', 'w') as log_file:
         log_file.write('Barcode: ' + str(barcode) + '\n')
         log_file.write('Total number of reads (all barcodes): ' + str(res.count) + '\n')
         log_file.write('Number of pIDs: ' + str(len(res.good_reads[bc])) + '\n')
@@ -210,7 +211,11 @@ def export_good_reads_to_fasta_file_for_consensus_compress(res, barcode, mosp):
     barcode
     mosp min_occ_same_pid
     '''
-    with open('../data/' + res.prefix_date_and_id + '_' + barcode + '_filtered-reads-SW-mosp-' + str(mosp) + '.fasta', 'w') as out_file:
+    ####
+    # make directory for this run
+    ####
+    lt.check_and_create_directory(res.runid)
+    with open(res.runid+'/bc_'+barcode+'_min_'+str(mosp)+'.fasta', 'w') as out_file:
         for pID in sorted(res.good_reads[barcode].keys()):
             count_seq = res.good_reads[barcode][pID]
             if len(count_seq) >= mosp: # write the reads corresponding to pID if this one appears at least mosp times
@@ -221,7 +226,7 @@ def export_good_reads_to_fasta_file_for_consensus_compress(res, barcode, mosp):
                 for read_to_write in dict_seq.keys():
                     nb_fwd = dict_seq[read_to_write]['fwd']
                     nb_rev = dict_seq[read_to_write]['rev']
-                    out_file.write('>' + pID + '_' + str(nb_fwd) + '_' + str(nb_rev) + '\n')
+                    out_file.write('>' + lt.read_label(pID, nb_fwd, nb_rev) + '\n')
                     out_file.write(read_to_write+'\n')
                 
 
@@ -243,7 +248,7 @@ def plot_read_length_distribution(res):
         plt.ylabel('number of reads')
         plt.legend(loc = 2)
         plt.xlim([0,500])
-        plt.savefig('../figures/' + res.prefix_date_and_id + '_SW_matching_read_length.pdf')
+        plt.savefig(res.runid + '/read_length.pdf')
 
 #####
 def plot_number_of_reads_per_pID(res):
@@ -259,7 +264,8 @@ def plot_number_of_reads_per_pID(res):
         ax.set_xscale('log')
         plt.ylim([0.8,1.2e4])
         plt.legend(loc=1)
-        plt.savefig('../figures/' + res.prefix_date_and_id + '_SW_PID_copy_number.pdf')
+        plt.savefig(res.runid + '/copy_number.pdf')
+
 
         
 
@@ -277,6 +283,7 @@ if __name__=='__main__':
     except:
         print 'Problem for initialisation: there is something wrong with the configuration file ' + CONFIG_FILE_NAME + '!\n'
 
+        
     ####
     # Filter the data file
     ####
index be299e6..af5a381 100755 (executable)
@@ -31,13 +31,10 @@ if (len(sys.argv)==3):
     #parse the input directory name
     ####
     path_to_templates = '../templates/'
-    #temp_dir_basename = dir_to_check[len(path_to_templates):]
-    temp_dir_basename = dir_to_check.split('/')[2]
-    [prefix_date_and_id, bc]= [temp_dir_basename.split('_')[i] for i in [0,2]]
-    prefix_date_and_id = prefix_date_and_id[len('dir-'):] # "dir-" to remove
-    #if bc[-1]=='/':
-    #    bc= bc[:-1] # "/" to remove
+    temp_dir_basename = lt.get_last_part_of_path(dir_to_check)
+    prefix_date_and_id, bc = lt.parse_dir_name(temp_dir_basename)[:2]
     print prefix_date_and_id, bc
+
     ####
     # create the specific cluster directory
     ####
index 85b64af..b716deb 100755 (executable)
@@ -35,41 +35,53 @@ class struct_var_set:
         #self.templates={}
         self.barcode = 'NYD'
 
-
+batchsize = 10
 ######
 # MAIN
 ######
-res = struct_var_set()
-
-if (len(sys.argv) == 2):
+if (len(sys.argv) > 1):
     relative_path_to_filtered_reads_file = str(sys.argv[1])
-    path_to_data_file = "../data/"
-    path_to_templates = "../templates/"
-    lt.check_and_create_directory(path_to_templates)
-    filtered_reads_file_basename = relative_path_to_filtered_reads_file.split('/')[-1]
+    if len(sys.argv)>2:
+        analysis_type = '_'+sys.argv[2]
+    else:
+        analysis_type = ''
+    filtered_reads_file = lt.get_last_part_of_path(relative_path_to_filtered_reads_file)
+    path_base = lt.get_base_of_path(relative_path_to_filtered_reads_file)
+    barcode = filtered_reads_file.split('_')[1]
+
+    #create the directory for the given barcode
+    analysis_dir = path_base+'/bc_'+barcode+'_analysis'+analysis_type
+    lt.check_and_create_directory(analysis_dir)
+
+    # create directory for batch 0
+    batch=0
+    temp_pid_files_dir =analysis_dir+'/temp_'+"{0:04d}".format(batch)
+    lt.check_and_create_directory(temp_pid_files_dir)
     dict_pIDs = defaultdict(list)
     
     #import the file containing, for the given barcode, the sequences for each pID
     #generate the pID specific temp files for alignments
     with open(relative_path_to_filtered_reads_file, 'r') as input_file:
-        [prefix_date_and_id, res.barcode] = [filtered_reads_file_basename.split('_')[i] for i in [0,1]]
         count=0
         for record in SeqIO.parse(input_file, 'fasta'):
             pID = str(record.id.split('_')[0])
             dict_pIDs[pID].append((record.id,record.seq))
 
-        #create the directory for the given barcode
-        dir_name_bc = str(path_to_templates+'dir-'+prefix_date_and_id+'_temp_'+res.barcode)
-        lt.check_and_create_directory(dir_name_bc)
-        for pID in dict_pIDs.keys():
+        all_pIDs = sorted(dict_pIDs.keys())
+        for pii, pID in enumerate(all_pIDs):
             # write the temp files for each pID in the corresponding barcode directory
-            with open(dir_name_bc+'/'+prefix_date_and_id+'_temp_'+res.barcode+'_'+pID +'.fasta', 'w') as output_pID_file:
+            with open(temp_pid_files_dir+'/'+ pID+'.fasta', 'w') as output_pID_file:
                 for read in dict_pIDs[pID]:
                     output_pID_file.write(str('>'+read[0]+'\n'))
                     output_pID_file.write(str(read[1]+'\n'))
                     count+=1
                     if(count%500==0):
                         print 'count = ' + str(count)
+            if ((batch+1)*batchsize<pii):
+                batch+=1
+                temp_pid_files_dir =analysis_dir+'/temp_'+"{0:04d}".format(batch)
+                lt.check_and_create_directory(temp_pid_files_dir)
+
         print 'total : ' + str(count)
         #else:
         #    print 'no file created'
diff --git a/src/p2_sort_all.py b/src/p2_sort_all.py
new file mode 100644 (file)
index 0000000..d0f8ae7
--- /dev/null
@@ -0,0 +1,11 @@
+import sys
+import glob
+import subprocess as sp
+
+if len(sys.argv)==2:
+    rundir = sys.argv[1].rstrip('/')+'/'
+    
+    reads_by_barcode = glob.glob(rundir+'bc*.fasta')
+    for bc_file in reads_by_barcode:
+        sp.call(['python', 'src/p2_sort.py',bc_file])
+
index f62fd43..d0dfc74 100755 (executable)
@@ -1,25 +1,19 @@
+#!/ebio/ag-neher/share/programs/bin/python2
 
-#!/usr/bin/python
-#/!\#
-#### /ebio/ag-neher/share/pograms/EPD/bins/python
-#/!\#
 
 ## script that aligns the temp reads files (one per pID/barcode) in the given directory
-## input: directory (for a given barcode) containing the temp reads files to align (in ../templates/)
-## output: aligned reads files (one per pID) contained in a directory "align" (in ../templates/)
-
+## input: directory (for a given barcode) containing the directories named temp_*
+## output: aligned reads files (one per pID) contained in each of these directories
 import numpy as np
 from Bio import SeqIO
 from Bio.Align.Applications import MuscleCommandline
 import os
 import sys
+sys.path.append('./src/')
 import time
 import lib_tools as lt
 import glob
 
-auto_file_name = str(sys.argv[0])
-
-
 ######
 # DEF FUNCTIONS
 ######
@@ -30,40 +24,15 @@ if (len(sys.argv)==2):
     # parse the input directory name
     ####
     relative_path_to_temp_directory = str(sys.argv[1])
-    if relative_path_to_temp_directory[-1]!='/':
-        relative_path_to_temp_directory+='/'
-    path_to_templates='../templates/'
-    temp_directory_basename = relative_path_to_temp_directory.split('/')[-2]
-    print temp_directory_basename
-    [prefix_date_and_id, bc]= [temp_directory_basename.split('_')[i] for i in [0,2]]
-    prefix_date_and_id = prefix_date_and_id[len('dir-'):] # "dir-" to remove
-    print prefix_date_and_id, bc
+    relative_path_to_temp_directory = relative_path_to_temp_directory.rstrip('/')+'/'
 
-    ####
-    # collect the temp reads files to align for the given barcode
-    ####
-    list_temp_files = glob.glob(relative_path_to_temp_directory+'*')
-
-    # list of "pIDs" to generate the pID alignment files 
-    list_of_pIDs = [fname.split('_')[-1].split('.')[0] for fname in list_temp_files]
-    print 'list_of_pIDs created, length: ' + str(len(list_of_pIDs))
-    set_of_pIDs = set(list_of_pIDs) 
-    print 'no duplicates in pIDs list: '+str(len(set_of_pIDs)==len(list_of_pIDs))
-
-    ####
-    # align the reads (temp files) and creates the corresponding aligned files in the align directory (created automatically)
-    ####
-    #create the directory for the aligned reads
-    dir_align_name_bc = str(path_to_templates+'dir-'+prefix_date_and_id+'_align_'+bc)
-    dir_temp_name_bc =  str(path_to_templates+'dir-'+prefix_date_and_id+'_temp_'+bc)
+    list_temp_files = glob.glob(relative_path_to_temp_directory+'*.fasta')
 
-    lt.check_and_create_directory(dir_align_name_bc)
     time_start = time.time()
-    for pID in list_of_pIDs:
-        temp_file_name = dir_temp_name_bc+ '/' +prefix_date_and_id + '_temp_' + bc + '_' + pID + '.fasta'
-        align_file_name=dir_align_name_bc+ '/' +prefix_date_and_id + '_align_'+ bc + '_' + pID + '.fasta'
-        print 'Alignment for file: ' + temp_file_name
-        cline = MuscleCommandline(input = temp_file_name, out = align_file_name)
+    for fname in list_temp_files:
+        align_file_name= lt.trim_extension(fname)+'_aligned.fasta'
+        print 'Alignment for file: ' + fname
+        cline = MuscleCommandline(input = fname, out = align_file_name)
         #
         cline()
         #
diff --git a/src/p3_cluster_align.py b/src/p3_cluster_align.py
new file mode 100755 (executable)
index 0000000..ec3e4f7
--- /dev/null
@@ -0,0 +1,31 @@
+#!/usr/bin/python
+
+import numpy as np
+from Bio import SeqIO
+from Bio.Align.Applications import MuscleCommandline
+import os
+import sys
+import time
+import glob 
+
+####
+# retrieve the temp reads file names to align in the specific cluster directory (in ../templates/) given in input
+# the given cluster directory path must be relative to the current directory (src)
+####
+if(len(sys.argv)==2):
+    analysis_dir = str(sys.argv[1])
+    analysis_dir = analysis_dir.rstrip('/')+'/'
+
+    #collect the list of temp reads files to align
+    list_temp_dirs = glob.glob(analysis_dir+'temp_*')
+    #run one job per file to align on the cluster
+    for temp_dir in list_temp_dirs:
+        cmd = 'qsub -cwd -l h_rt=12:00:00 -l h_vmem=10G ./src/p3_align.py '+temp_dir
+        print cmd
+        os.system(cmd)
+
+else:
+    print auto_file_name+': usage: '+auto_file_name+' <specific cluster directory relative path (in ../templates/)>'
+
+
+
diff --git a/src/p3_cluster_align_aux.py b/src/p3_cluster_align_aux.py
deleted file mode 100755 (executable)
index 197a4e7..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/usr/bin/python
-import os
-import sys
-
-cmd = '/ebio/ag-neher/share/programs/EPD/bin/python2.7 ' + '/ebio/ag-neher/share/users/ebenard/PID_Karolinska_2/src/'
-
-for arg in sys.argv[1:]:
-    cmd+=arg+' '
-
-print cmd
-os.system(cmd)
diff --git a/src/p3_cluster_align_main.py b/src/p3_cluster_align_main.py
deleted file mode 100755 (executable)
index 21170ce..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/usr/bin/python
-
-import numpy as np
-from Bio import SeqIO
-from Bio.Align.Applications import MuscleCommandline
-import os
-import sys
-import time
-
-auto_file_name = str(sys.argv[0])
-
-####
-# retrieve the temp reads file names to align in the specific cluster directory (in ../templates/) given in input
-# the given cluster directory path must be relative to the current directory (src)
-####
-if(len(sys.argv)==2):
-    cluster_dir = str(sys.argv[1])
-    if cluster_dir[-1]!='/':
-        cluster_dir=cluster_dir+'/'
-    path_to_dir = "../templates/"
-    cluster_dir_base_name = cluster_dir[len(path_to_dir):] # "../templates/" to remove
-    
-    #collect the list of temp reads files to align
-    list_temp_files = glob.glob(cluster_dir+'*')
-    print list_temp_files
-    #run one job per file to align on the cluster
-    for cur_file in list_temp_files:
-        cmd = 'qsub -cwd -l h_rt=12:00:00 -l h_vmem=10G ./p3_cluster_align_aux.py ./p3_cluster_align_script.py ' + cur_file
-        #cmd = './p3_cluster_align_aux.py p3_cluster_align_script.py ' + cur_file
-        print cmd
-        os.system(cmd)
-
-else:
-    print auto_file_name+': usage: '+auto_file_name+' <specific cluster directory relative path (in ../templates/)>'
-
-
-
diff --git a/src/p3_cluster_align_script.py b/src/p3_cluster_align_script.py
deleted file mode 100755 (executable)
index 08fe5f2..0000000
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/python
-
-import numpy as np
-from Bio import SeqIO
-from Bio.Align.Applications import MuscleCommandline
-import os
-import sys
-import time
-
-auto_file_name = str(sys.argv[0])
-
-####
-#align the reads of the temp reads file given in input
-####
-if len(sys.argv)==2:
-    relative_path_to_file_to_align = str(sys.argv[1])
-    path_to_templates = "../templates/"
-    [cluster_dir_basename,file_to_align_basename] = [relative_path_to_file_to_align.split('/')[i] for i in [2,3]] # "../templates/cluster-<date+id>/<date+id>_temp_<barcode>_<pID>.fasta"
-    #print cluster_dir_basename
-    [prefix_date_and_id,barcode,pID_to_clean] = [file_to_align_basename.split('_')[i] for i in [0,2,3]] #"<date+id>_temp_<barcode>_<pID>.fasta"
-    pID = pID_to_clean.split('.')[0] # remove the file extension
-
-    print '****** job for the file: ' + relative_path_to_file_to_align
-
-    temp_file_name = relative_path_to_file_to_align
-    align_file_name = path_to_templates+cluster_dir_basename+'/'+prefix_date_and_id+'_align_'+barcode+'_'+pID+'.fasta'
-    cline = MuscleCommandline(input = temp_file_name, out = align_file_name)
-    #print cline
-    #os.system(str('echo '+pID+' > '+align_file_name))
-    time_start = time.time()
-    #
-    cline()
-    #
-    time_end = time.time()
-    
-    print 'time: ' + str(time_end - time_start)
-    
-else:
-    print auto_file_name+': miss the temp reads file to align!'
-    
index cef81e0..d7cb815 100755 (executable)
@@ -8,17 +8,12 @@
 import numpy as np
 from Bio import AlignIO
 from Bio import SeqIO
-from Bio.Alphabet import generic_dna
-from Bio.Align import AlignInfo
-from Bio.Align import MultipleSeqAlignment
 from Bio.SeqRecord import SeqRecord
-from collections import Counter
-from collections import defaultdict
 import os
+import shutil
 import sys
 import glob
 import time
-import datetime
 import lib_tools as lt
 
 auto_file_name = str(sys.argv[0])
@@ -29,12 +24,12 @@ auto_file_name = str(sys.argv[0])
 
 
 ######
-def make_consensus(file_name, pID):
+def make_consensus(file_name):
     #print 'consensus for file: '+file_name
     alignment = AlignIO.read(file_name,"fasta")
-    nb_reads_fwd = np.asarray([int(seq.id.split('_')[1]) for seq in alignment], int)
+    nb_reads_fwd = np.asarray([lt.parse_read_label(seq.id)[1] for seq in alignment], int)
     #print nb_reads_fwd
-    nb_reads_rev = np.asarray([int(seq.id.split('_')[2]) for seq in alignment], int)
+    nb_reads_rev = np.asarray([lt.parse_read_label(seq.id)[2] for seq in alignment], int)
     #print nb_reads_rev
     nb_reads = nb_reads_fwd + nb_reads_rev
 
@@ -60,37 +55,35 @@ def make_consensus(file_name, pID):
 
 
 if __name__=='__main__':
-    if (len(sys.argv)==2):
-        # parse the input aligned files directory name
-        relative_path_to_align_dir=str(sys.argv[1])
-        if(relative_path_to_align_dir[-1]!='/'):
-            relative_path_to_align_dir+='/'
-        path_to_templates = "../templates/"
-        align_dir_basename = relative_path_to_align_dir.split('/')[2]
-        [prefix_date_and_id,file_type,barcode] = [align_dir_basename.split('dir-')[1].split('_')[i] for i in [0,1,2]]
-
-        # create (if necessary) the consensus directory
-        lt.check_and_create_directory(str(path_to_templates+'dir-'+prefix_date_and_id+'_consensus'))
-
-        # get the list of files to consensus (one per pID)
-        list_files_to_consensus = glob.glob(relative_path_to_align_dir+'*')
-        dict_consensus_seq = {}
-        for cur_file in list_files_to_consensus:
-            cur_file_pID = cur_file.split('_')[-1].split('.')[0] #"<prefix_date_and_id>_align_<barcode>_<pID>.fasta"
-            cur_file_barcode = cur_file.split('_')[2]
-            #print cur_file
-            result_consensus_cur_file = make_consensus(cur_file,cur_file_pID)
-            if (result_consensus_cur_file != ('nothing',0,0)): # store the consensus sequence
-                dict_consensus_seq[cur_file_pID]=result_consensus_cur_file
-            #print result_consensus_cur_file
-        # write the consensus sequences of each pID (with #occ >= 3) in the specific consensus seq file for the given barcode
-        with open(str(path_to_templates+'dir-'+prefix_date_and_id+'_consensus/'+prefix_date_and_id+'_consensus_'+barcode+'.fasta'),'w') as out_file:
-            for pID in sorted(dict_consensus_seq.keys()):
-                out_file.write('>'+pID+'_'+str(dict_consensus_seq[pID][1])+'_'+str(dict_consensus_seq[pID][2])+'\n')
-                out_file.write(dict_consensus_seq[pID][0]+'\n')
-
-
-
+    if(len(sys.argv)>1):
+        # parse the input consensus sequences file name
+        analysis_dir=str(sys.argv[1]).rstrip('/')+'/'
+        if len(sys.argv)>2:
+            analysis_type = '_'+sys.argv[2]
+        else:
+            analysis_type = ''
+        
+        temp_directories = glob.glob(analysis_dir+'temp_*')
+        consensus_fname = analysis_dir+'consensus_sequences'+analysis_type+'.fasta'
+        aligned_reads_fname = analysis_dir+'aligned_reads'+analysis_type+'.fasta'
+
+        with open(consensus_fname, 'w') as consensus_file, \
+                open(aligned_reads_fname, 'w') as aligned_reads_file:
+            print temp_directories
+            for temp_dir in temp_directories:
+                pID_files = glob.glob(temp_dir+'/*aligned.fasta')
+                print pID_files
+                for pID_file in pID_files:
+                    pID = lt.get_last_part_of_path(pID_file).split('_')[0]
+                    with open(pID_file, 'r') as infile:
+                        tmp_aln = AlignIO.read(infile, 'fasta')
+                    AlignIO.write(tmp_aln, aligned_reads_file, 'fasta')
+                    consensus_seq = make_consensus(pID_file)
+                    if consensus_seq[1]+consensus_seq[2]>2:
+                        consensus_file.write('>'+lt.read_label(pID, consensus_seq[1], consensus_seq[2])+'\n')
+                        consensus_file.write(consensus_seq[0]+'\n')
+
+                shutil.rmtree(temp_dir)
     else:
         print auto_file_name+': usage: '+auto_file_name+' <aligned reads files directory (in ../templates/)>'
 
index 0356f64..05bd2b7 100755 (executable)
@@ -22,9 +22,12 @@ auto_file_name = str(sys.argv[0])
 ######
 
 
-if(len(sys.argv)==2):
+if(len(sys.argv)>1):
     # parse the input consensus sequences file name
-    relative_path_to_cons_seq_file=str(sys.argv[1])
+    rundir=str(sys.argv[1]).rstrip('/')+'/'
+    if len(sys.argv)>2:
+        analysis_type = '_'+sys.argv[2]
+
     path_to_templates = "../templates/"
     cons_seq_file_basename = lt.get_last_part_of_path(relative_path_to_cons_seq_file)