debug
author	ebenard <ebenard@ag-neher-benard.(none)>
Thu, 19 Sep 2013 17:00:33 +0000 (19:00 +0200)
committer	ebenard <ebenard@ag-neher-benard.(none)>
Thu, 19 Sep 2013 17:00:33 +0000 (19:00 +0200)
src/p3_cluster_bis_clean_logs_and_move_back_aligned_files.py
src/p5_gather_aligned_reads.py
src/p6_align_global_per_barcode.py
src/p7_detect_mutants_indels.py
src/p8_decontamination.py

index a70a23d..6d97210 100755 (executable)
@@ -8,6 +8,7 @@
 import os
 import sys
 import time
+import glob
 from collections import Counter
 from collections import defaultdict
 import lib_tools as lt
@@ -17,47 +18,52 @@ auto_file_name = str(sys.argv[0])
         
 ######
 
-if (len(sys.argv)==2):
+if __name__=='__main__':
 
-    # 1. clean the cluster logs in src directory (normally the current directory)
-    os.system('rm -f p3_cluster_align_aux.py.e* p3_cluster_align_aux.py.o*')
-    print 'cluster logs p3_cluster_align_aux_py.e* and p3_cluster_align_aux.py.o* deleted'
+    if (len(sys.argv)==2):
 
-    # 2. move the temp and aligned reads files from the cluster directory to their specific temp/align directory
-    relative_path_to_cluster_dir = str(sys.argv[1])
-    if relative_path_to_cluster_dir[-1]!='/':
-        relative_path_to_cluster_dir+='/'
-    path_to_templates = "../templates/"
-    prefix_date_and_id=relative_path_to_cluster_dir.split('/')[2].split('cluster-')[1]
-    print prefix_date_and_id
+        # 1. clean the cluster logs in src directory (normally the current directory)
+        os.system('rm -f p3_cluster_align_aux.py.e* p3_cluster_align_aux.py.o*')
+        print 'cluster logs p3_cluster_align_aux.py.e* and p3_cluster_align_aux.py.o* deleted'
+
+        # 2. move the temp and aligned reads files from the cluster directory to their specific temp/align directory
+        relative_path_to_cluster_dir = str(sys.argv[1])
+        if relative_path_to_cluster_dir[-1]!='/':
+            relative_path_to_cluster_dir+='/'
+
+        path_to_templates = "../templates/"
+        prefix_date_and_id=relative_path_to_cluster_dir.split('/')[2].split('cluster-')[1]
+        print prefix_date_and_id
         
-    list_files = os.popen('ls '+relative_path_to_cluster_dir+'*').readlines()
-    total_nb_files_in_cluster_dir = len(list_files)
-    print total_nb_files_in_cluster_dir
-
-    count_files = Counter()
-    dict_files = defaultdict(list)
-
-    for cur_file in list_files:
-        cur_file = cur_file.strip()
-        cur_file_base_name = cur_file.split('/')[-1]
-        cur_file_type, cur_file_bc = [cur_file_base_name.split('_')[i] for i in [1,2]]
-        count_files[cur_file_type]+=1
-        dict_files[(cur_file_type,cur_file_bc)].append(cur_file_base_name)
-    print '#files in cluster directory: '+str(count_files)
-    #print dict_files
-    
-    # move the files to their directory
-    for type_and_bc in dict_files.keys():
-        # create the aligned files directory if necessary
-        new_file_directory = str(path_to_templates+'dir-'+prefix_date_and_id+'_'+type_and_bc[0]+'_'+type_and_bc[1]+'/')
-        lt.check_and_create_directory(new_file_directory)
-        for cur_file in dict_files[type_and_bc]:            
-            print 'move file '+relative_path_to_cluster_dir+cur_file+' to the directory: '+new_file_directory
-            os.system('mv '+relative_path_to_cluster_dir+cur_file+' '+new_file_directory)
-            count_files[type_and_bc[0]]-=1
-    print '#remaining files in cluster directory: '+str(count_files)
+        #list_files = os.popen('ls '+relative_path_to_cluster_dir+'*').readlines()
+        list_files = glob.glob(relative_path_to_cluster_dir+'*') 
+        total_nb_files_in_cluster_dir = len(list_files)
+        print total_nb_files_in_cluster_dir
+
+        count_files = Counter()
+        dict_files = defaultdict(list)
+        
+        for cur_file in list_files:
+            cur_file = cur_file.strip()
+            cur_file_base_name = cur_file.split('/')[-1]
+            cur_file_type, cur_file_bc = [cur_file_base_name.split('_')[i] for i in [1,2]]
+            count_files[cur_file_type]+=1
+            dict_files[(cur_file_type,cur_file_bc)].append(cur_file_base_name)
+
+        print '#files in cluster directory: '+str(count_files)
+        # print dict_files
     
+        # move the files to their directory
+        for type_and_bc in dict_files.keys():
+            # create the aligned files directory if necessary
+            new_file_directory = str(path_to_templates+'dir-'+prefix_date_and_id+'_'+type_and_bc[0]+'_'+type_and_bc[1]+'/')
+            lt.check_and_create_directory(new_file_directory)
+            for cur_file in dict_files[type_and_bc]:            
+                print 'move file '+relative_path_to_cluster_dir+cur_file+' to the directory: '+new_file_directory
+                os.system('mv '+relative_path_to_cluster_dir+cur_file+' '+new_file_directory)
+                count_files[type_and_bc[0]]-=1
 
-else:
-    print auto_file_name+': usage: '+auto_file_name+' <cluster directory (in ../templates/)>'
+        print '#remaining files in cluster directory: '+str(count_files)
+    
+    else:
+        print auto_file_name+': usage: '+auto_file_name+' <cluster directory (in ../templates/)>'
index 3fc3fd3..9b7e11b 100755 (executable)
@@ -3,7 +3,8 @@
 
 #
 # script that gathers all the aligned reads for a given barcode in one file
-#
+# input: aligned files directory (in ../templates/)
+# output: gathered aligned reads files in a specific 'all-align-in-one-file' directory (in ../templates/)
 
 import numpy as np
 from Bio import AlignIO
@@ -18,44 +19,36 @@ import os
 import sys
 import time
 import datetime
+import glob
 import lib_tools as lt
 
 auto_file_name = str(sys.argv[0])
 
-######
-# DEF FUNCTIONS
-######
-
-
-if (len(sys.argv)==2):
-    # parse the input aligned files directory name
-    relative_path_to_align_dir=str(sys.argv[1])
-    if(relative_path_to_align_dir[-1]!='/'):
-        relative_path_to_align_dir+='/'
-    path_to_templates = "../templates/"
-    align_dir_basename = relative_path_to_align_dir.split('/')[2]
-    [prefix_date_and_id,file_type,barcode] = [align_dir_basename.split('dir-')[1].split('_')[i] for i in [0,1,2]]
-
-    # create (if necessary) the all-align-in-one-file directory
-    lt.check_and_create_directory(str(path_to_templates+'dir-'+prefix_date_and_id+'_all-align-in-one-file'))
-    
-    # get the list of files to gather 
-    list_files_to_gather = os.popen(str('ls '+relative_path_to_align_dir+'*')).readlines()
-    list_files_to_gather = [file_to_gather.strip() for file_to_gather in list_files_to_gather]
-    
+if __name__=='__main__':
+    if (len(sys.argv)==2):
+        # parse the input aligned files directory name
+        relative_path_to_align_dir=str(sys.argv[1])
+        if(relative_path_to_align_dir[-1]!='/'):
+            relative_path_to_align_dir+='/'
+        path_to_templates = "../templates/"
+        align_dir_basename = relative_path_to_align_dir.split('/')[2]
+        [prefix_date_and_id,file_type,barcode] = [align_dir_basename.split('dir-')[1].split('_')[i] for i in [0,1,2]]
+
+        # create (if necessary) the all-align-in-one-file directory
+        lt.check_and_create_directory(str(path_to_templates+'dir-'+prefix_date_and_id+'_all-align-in-one-file'))
     
-    global_list_alignments = []
+        # get the list of files to gather 
+        list_files_to_gather = glob.glob(relative_path_to_align_dir+'*')
     
-    for i in range(len(list_files_to_gather)):
-        list_seq_cur_file = []
-        input_file_name = list_files_to_gather[i]
-        for seq_record in SeqIO.parse(input_file_name, 'fasta'):
-            list_seq_cur_file.append(seq_record)
-        
-        cur_alignment = MultipleSeqAlignment(list_seq_cur_file)
-        global_list_alignments.append(cur_alignment)
-    AlignIO.write(global_list_alignments, str(path_to_templates+'dir-'+prefix_date_and_id+'_all-align-in-one-file/'+prefix_date_and_id+'_all-align-in-one-file_'+barcode+'.fasta'), 'fasta')
-       
+        global_list_alignments = []
     
-else:
-    print auto_file_name+': usage: '+auto_file_name+' <aligned reads files directory (in ../templates/)>'
+        for i in range(len(list_files_to_gather)):
+            list_seq_cur_file = []
+            input_file_name = list_files_to_gather[i]
+            for seq_record in SeqIO.parse(input_file_name, 'fasta'):
+                list_seq_cur_file.append(seq_record)
+            cur_alignment = MultipleSeqAlignment(list_seq_cur_file)
+            global_list_alignments.append(cur_alignment)
+        AlignIO.write(global_list_alignments, str(path_to_templates+'dir-'+prefix_date_and_id+'_all-align-in-one-file/'+prefix_date_and_id+'_all-align-in-one-file_'+barcode+'.fasta'), 'fasta')
+    else:
+        print auto_file_name+': usage: '+auto_file_name+' <aligned reads files directory (in ../templates/)>'
index 6c6dc19..7fb65e4 100755 (executable)
@@ -17,35 +17,32 @@ import lib_tools as lt
 
 auto_file_name = str(sys.argv[0])
 
-######
-# DEF FUNCTIONS
-######
 
-
-if(len(sys.argv)==2):
-    # parse the input consensus sequences file name
-    relative_path_to_cons_seq_file=str(sys.argv[1])
-    path_to_templates = "../templates/"
-    cons_seq_file_basename = relative_path_to_cons_seq_file.split('/')[3]
-    
-    [prefix_date_and_id,file_type,barcode] = [cons_seq_file_basename.split('_')[i] for i in [0,1,2]]
-    barcode = barcode.split('.')[0] # remove file extension ".fasta"
-
-    # create (if necessary) the consensus directory
-    lt.check_and_create_directory(str(path_to_templates+'dir-'+prefix_date_and_id+'_align-global'))
+if __name__=='__main__':
+    if(len(sys.argv)==2):
+        # parse the input consensus sequences file name
+        relative_path_to_cons_seq_file=str(sys.argv[1])
+        path_to_templates = "../templates/"
+        cons_seq_file_basename = relative_path_to_cons_seq_file.split('/')[3]
     
-    # align
-    aligned_cons_seq_file_name = str(path_to_templates+'dir-'+prefix_date_and_id+'_align-global/'+prefix_date_and_id+'_align-global_'+barcode+'.fasta') 
-    print 'Global consensus sequences alignment for barcode: ' + barcode + ' (from file: ' + relative_path_to_cons_seq_file + ' )'
+        [prefix_date_and_id,file_type,barcode] = [cons_seq_file_basename.split('_')[i] for i in [0,1,2]]
+        barcode = barcode.split('.')[0] # remove file extension ".fasta"
 
-    cline = MuscleCommandline(input = relative_path_to_cons_seq_file, out = aligned_cons_seq_file_name)
+        # create (if necessary) the consensus directory
+        lt.check_and_create_directory(str(path_to_templates+'dir-'+prefix_date_and_id+'_align-global'))
     
-    time_start = time.time()
-    #
-    cline()
-    #
-    time_end = time.time()
-    print 'alignment computation time: ' + str(time_end - time_start)
-
-else:
-    print auto_file_name + ': usage: '+ auto_file_name + ' <consensus sequences file (in ../templates/dir-<date_and_id>_consensus)>'
+        # align
+        aligned_cons_seq_file_name = str(path_to_templates+'dir-'+prefix_date_and_id+'_align-global/'+prefix_date_and_id+'_align-global_'+barcode+'.fasta') 
+        print 'Global consensus sequences alignment for barcode: ' + barcode + ' (from file: ' + relative_path_to_cons_seq_file + ' )'
+        
+        cline = MuscleCommandline(input = relative_path_to_cons_seq_file, out = aligned_cons_seq_file_name)
+        
+        time_start = time.time()
+        #
+        cline()
+        #
+        time_end = time.time()
+        print 'alignment computation time: ' + str(time_end - time_start)
+
+    else:
+        print auto_file_name + ': usage: '+ auto_file_name + ' <consensus sequences file (in ../templates/dir-<date_and_id>_consensus)>'
index c1e8585..6e285a3 100755 (executable)
@@ -7,7 +7,7 @@
 #        - consensus sequences file (in ../templates/dir-<date_and_id>_consensus/)
 # 
 # output: filtered reads file (in ../data/) resolving the pID mutations, new <date_and_id> = <date_and_id>-mutant-indels
-#         new read id: <pID-parent>_<nb-reads-fwd>_<nb-reads-rev>_<pID-origin>  
+#         new read id: <pID-parent>_<nb-reads-fwd>_<nb-reads-rev>_<pID-origin>[_m if a mutation occurred]  
 ####
 
 import numpy as np
@@ -40,243 +40,235 @@ MIN_OCC_MOST_ABUND_PID = 10
 ########
 
 
-
-if (len(sys.argv) == 3):
-    # parse the input data files names
-    relative_path_to_filtered_reads_file = str(sys.argv[1])
-    relative_path_to_cons_seq_file = str(sys.argv[2])
+if __name__=='__main__':
+    if (len(sys.argv) == 3):
+        # parse the input data files names
+        relative_path_to_filtered_reads_file = str(sys.argv[1])
+        relative_path_to_cons_seq_file = str(sys.argv[2])
     
-    path_to_data = '../data/'
-    path_to_templates = '../templates/'
+        path_to_data = '../data/'
+        path_to_templates = '../templates/'
     
-    filtered_reads_file_basename = relative_path_to_filtered_reads_file.split('/')[2]
-    cons_seq_file_basename = relative_path_to_cons_seq_file.split('/')[3]
+        filtered_reads_file_basename = relative_path_to_filtered_reads_file.split('/')[2]
+        cons_seq_file_basename = relative_path_to_cons_seq_file.split('/')[3]
 
-    [prefix_date_and_id, barcode] = [filtered_reads_file_basename.split('_')[i] for i in [0,1]]
-    [prefix_date_and_id_cs, barcode_cs] = [cons_seq_file_basename.split('_')[i] for i in [0,2]]
-    barcode_cs = barcode_cs.split('.')[0] # remove ".fasta"
+        [prefix_date_and_id, barcode] = [filtered_reads_file_basename.split('_')[i] for i in [0,1]]
+        [prefix_date_and_id_cs, barcode_cs] = [cons_seq_file_basename.split('_')[i] for i in [0,2]]
+        barcode_cs = barcode_cs.split('.')[0] # remove ".fasta"
 
-    # check the similarity
-    if ((barcode == barcode_cs) and (prefix_date_and_id == prefix_date_and_id_cs)):
-      
-        # 1. Create the dictionary of pIDs (from the output files of p1_trim_and_filter)
+        # check the barcode similarity
+        if ((barcode == barcode_cs) and (prefix_date_and_id == prefix_date_and_id_cs)):
         
-        # dictionary containing the list of reads (values) associated to a pID (keys)
-        dict_all_reads = defaultdict(list)
-
-        # dictionary containing the consensus sequence (values) associated to a pID (keys) if it occurs at least 3 times
-        dict_cons_seq = defaultdict(list)
-
-        count_reads_with_invalid_pID = 0
-        count_reads_added_to_dict_all_reads = 0
-
-        ###################################
-        # -- READ THE FILTERED READS FILE #
-        ###################################
-        with open(relative_path_to_filtered_reads_file, 'r') as filtered_reads_file:
-            # for all the valid reads (without Ns), add the read to dict_all_reads
-            for record in SeqIO.parse(filtered_reads_file, 'fasta'):
-                [pID,nb_reads_fwd,nb_reads_rev] = [record.id.split('_')[i] for i in [0,1,2]]
-                nb_reads_fwd = int(nb_reads_fwd)
-                nb_reads_rev = int(nb_reads_rev)
-                # (pID,identity,direction) = (str(record.id.split('_')[0]),str(record.id.split('_')[1]),str(record.id.split('_')[2]))
-                seq = str(record.seq)
-                if (pID.count('N') == 0): # if the pID is not ambiguous
-                    dict_all_reads[pID].append([nb_reads_fwd,nb_reads_rev,seq])
-                    count_reads_added_to_dict_all_reads += (nb_reads_fwd+nb_reads_rev)
-                else:
-                    #print 'Invalid pID: ' + pID
-                    count_reads_with_invalid_pID += (nb_reads_fwd+nb_reads_rev)
-
-        # control prints
-        ####
-        print 'count_reads_with_invalid_pID: '+str(count_reads_with_invalid_pID)
-        print 'count_reads_added_to_dict_all_reads: '+str(count_reads_added_to_dict_all_reads)
-        print '#pIDs different: '+str(len(dict_all_reads.keys()))
+            # Create the dictionary of pIDs (from the output files of p1_trim_and_filter)
         
-        count_pIDs_occ = Counter()
-            
-        for pID in dict_all_reads.keys():
-            nb_reads_diff = len(dict_all_reads[pID])
-            for i in range(nb_reads_diff):
-                nb_cur_reads = sum([dict_all_reads[pID][i][j] for j in [0,1]])
+            # dictionary containing the list of reads (values) associated to a pID (keys)
+            dict_all_reads = defaultdict(list)
+
+            # dictionary containing the consensus sequence (values) associated to a pID (keys) if it occurs at least 3 times
+            dict_cons_seq = defaultdict(list)
+
+            count_reads_with_invalid_pID = 0
+            count_reads_added_to_dict_all_reads = 0
+
+            ###################################
+            # -- READ THE FILTERED READS FILE #
+            ###################################
+            with open(relative_path_to_filtered_reads_file, 'r') as filtered_reads_file:
+                # for all the valid reads (without Ns), add the read to dict_all_reads
+                for record in SeqIO.parse(filtered_reads_file, 'fasta'):
+                    [pID,nb_reads_fwd,nb_reads_rev] = [record.id.split('_')[i] for i in [0,1,2]]
+                    nb_reads_fwd = int(nb_reads_fwd)
+                    nb_reads_rev = int(nb_reads_rev)
+                    seq = str(record.seq)
+                    if (pID.count('N') == 0): # if the pID is not ambiguous
+                        dict_all_reads[pID].append([nb_reads_fwd,nb_reads_rev,seq])
+                        count_reads_added_to_dict_all_reads += (nb_reads_fwd+nb_reads_rev)
+                    else:
+                        # print 'Invalid pID: ' + pID
+                        count_reads_with_invalid_pID += (nb_reads_fwd+nb_reads_rev)
+
+            # control prints
+            # ###
+            print 'count_reads_with_invalid_pID: '+str(count_reads_with_invalid_pID)
+            print 'count_reads_added_to_dict_all_reads: '+str(count_reads_added_to_dict_all_reads)
+            print '#pIDs distinct: '+str(len(dict_all_reads.keys()))
+        
+            count_pIDs_occ = Counter()
+        
+            for pID in dict_all_reads.keys():
+                nb_reads_diff = len(dict_all_reads[pID]) # = nb of distinct reads for the given pID
+                nb_cur_reads = 0
+                for i in range(nb_reads_diff):
+                    nb_cur_reads += sum([dict_all_reads[pID][i][j] for j in [0,1]])
                 count_pIDs_occ[((nb_cur_reads >= 3) and '3+') or ((nb_cur_reads == 2) and '2') or ('1')] += 1
 
-        print '#pIDs occurring only once: ' + str(count_pIDs_occ['1'])#str(len(list_pIDs_occurring_once))
-        print '#pIDs occurring only twice: ' + str(count_pIDs_occ['2'])#str(len(list_pIDs_occurring_twice))
-        print '#pIDs occurring at least 3 times: ' + str(count_pIDs_occ['3+'])#str(len(list_pIDs_occurring_at_least_3_times))
-        ####
+            print '#pIDs occurring only once: ' + str(count_pIDs_occ['1'])#str(len(list_pIDs_occurring_once))
+            print '#pIDs occurring only twice: ' + str(count_pIDs_occ['2'])#str(len(list_pIDs_occurring_twice))
+            print '#pIDs occurring at least 3 times: ' + str(count_pIDs_occ['3+'])#str(len(list_pIDs_occurring_at_least_3_times))
 
         
-        ########################################
-        # -- READ THE CONSENSUS SEQUENCES FILE #
-        ########################################
-        # normally, this reading of the consensus file should not modify the list of pIDs (keys of dict_all_reads)
-        with open(relative_path_to_cons_seq_file, 'r') as cons_seq_file:
-            for record in SeqIO.parse(cons_seq_file, 'fasta'):
-                [pID,nb_occ_fwd,nb_occ_rev] = [record.id.split('_')[i] for i in [0,1,2]]
-                nb_occ_fwd = int(nb_occ_fwd)
-                nb_occ_rev = int(nb_occ_rev)
-                cons_seq = str(record.seq)
-                #if (pID in dict_all_reads.keys()): # should always be the case
-                #    if (len(dict_all_reads[pID]) > 2): # if the pID occurs more than twice
-                # 
-                #else:
-                #    print 'ERROR: pID ' + pID + ' present in the consensus sequences file but not in the filtered reads file !!!'
-                dict_cons_seq[pID].append([nb_occ_fwd,nb_occ_rev,cons_seq])
-        print 'From consensus seq. file :'
-        print '#pIDs occurring at least 3 times: ' + str(len(dict_cons_seq.keys()))
+            ########################################
+            # -- READ THE CONSENSUS SEQUENCES FILE #
+            ########################################
+            # normally, this reading of the consensus file should not modify the list of pIDs (keys of dict_all_reads)
+            with open(relative_path_to_cons_seq_file, 'r') as cons_seq_file:
+                for record in SeqIO.parse(cons_seq_file, 'fasta'):
+                    [pID,nb_occ_fwd,nb_occ_rev] = [record.id.split('_')[i] for i in [0,1,2]]
+                    nb_occ_fwd = int(nb_occ_fwd)
+                    nb_occ_rev = int(nb_occ_rev)
+                    cons_seq = str(record.seq)
+                    dict_cons_seq[pID].append([nb_occ_fwd,nb_occ_rev,cons_seq])
+            print 'From consensus seq. file :'
+            print '#pIDs occurring at least 3 times: ' + str(len(dict_cons_seq.keys()))
         
 
-        ###########################
-        # -- SEARCH THE NEIGHBORS #
-        ###########################
-        # dictionary indicating the "neighbor" state of each pID
-        dict_neighbor_state_pIDs = {}
-        for pID in dict_all_reads.keys():
-            dict_neighbor_state_pIDs[pID] = False
+            ###########################
+            # -- SEARCH THE NEIGHBORS #
+            ###########################
+            # dictionary indicating the "neighbor" state of each pID
+            dict_neighbor_state_pIDs = {}
+            count_nb_pIDs_occ_sup_eq_3_in_filtered_reads_file = 0
+            for pID in dict_all_reads.keys():
+                dict_neighbor_state_pIDs[pID] = False
         
-        # sorted list of the #occurrences of each pID: couples (#occ, pID)
-        list_occ_pIDs_dec = [(sum([dict_cons_seq[pID][i][0]+dict_cons_seq[pID][i][1] for i in range(len(dict_cons_seq[pID]))]), pID) for pID in dict_cons_seq.keys()]
-        list_occ_pIDs_inc = [(sum([dict_cons_seq[pID][i][0]+dict_cons_seq[pID][i][1] for i in range(len(dict_cons_seq[pID]))]), pID) for pID in dict_cons_seq.keys()]
-        #list_occ_pIDs_inc = [(dict_cons_seq[pID][0][0]), pID) for pID in dict_cons_seq.keys()]
+                # sorted list of the #occurrences of each pID: couples (#occ, pID)
+                list_occ_pIDs_dec = [(sum([dict_cons_seq[pID][i][0]+dict_cons_seq[pID][i][1] for i in range(len(dict_cons_seq[pID]))]), pID) for pID in dict_cons_seq.keys()]
+                list_occ_pIDs_inc = [(sum([dict_cons_seq[pID][i][0]+dict_cons_seq[pID][i][1] for i in range(len(dict_cons_seq[pID]))]), pID) for pID in dict_cons_seq.keys()]
         
-        # -- add the pIDs with #occurrences < 3 from dict_all_reads to list_occ_pIDs
-        for pID in dict_all_reads.keys():
-            nb_occ_cur_pID = 0
-            for i in range(len(dict_all_reads[pID])):
-                nb_occ_cur_pID += sum([dict_all_reads[pID][i][j] for j in [0,1]]) # add nb_occ_fwd and nb_occ_rev
-            if nb_occ_cur_pID < 3:
-                list_occ_pIDs_dec.append((nb_occ_cur_pID,pID))
-                list_occ_pIDs_inc.append((nb_occ_cur_pID,pID))
-        list_occ_pIDs_dec.sort(key = lambda couple: couple[0], reverse=True)
-        list_occ_pIDs_inc.sort(key = lambda couple: couple[0])
-
-        print 'Check list_occ_pIDs_dec and list_occ_pIDs_inc lengths (should be equal): '        
-        print len(list_occ_pIDs_dec)
-        print len(list_occ_pIDs_inc)
-         
-
-        # dictionary containing the parent pID (values) of a given pID (keys)
-        dict_neighbor_of = defaultdict(list)
+            # -- add the pIDs with #occurrences < 3 from dict_all_reads to list_occ_pIDs
+            for pID in dict_all_reads.keys():
+                nb_occ_cur_pID = 0
+                for i in range(len(dict_all_reads[pID])):
+                    nb_occ_cur_pID += sum([dict_all_reads[pID][i][j] for j in [0,1]]) # add nb_occ_fwd and nb_occ_rev
+                if nb_occ_cur_pID < 3:
+                    list_occ_pIDs_dec.append((nb_occ_cur_pID,pID))
+                    list_occ_pIDs_inc.append((nb_occ_cur_pID,pID))
+                else:
+                    count_nb_pIDs_occ_sup_eq_3_in_filtered_reads_file += 1
+                list_occ_pIDs_dec.sort(key = lambda couple: couple[0], reverse=True)
+                list_occ_pIDs_inc.sort(key = lambda couple: couple[0])
+
+            print 'Check list_occ_pIDs_dec and list_occ_pIDs_inc lengths (should be equal): '        
+            print len(list_occ_pIDs_dec)
+            print len(list_occ_pIDs_inc)
+        
+            print '#####\n#pIDs with occ >=3 in filtered reads file: '+str(count_nb_pIDs_occ_sup_eq_3_in_filtered_reads_file)
+            print '# pIDs (with occ >= 3) in consensus seq. file: '+str(len(dict_cons_seq.keys()))
+            
+
+            # dictionary containing the parent pID (values) of a given pID (keys)
+            dict_neighbor_of = defaultdict(list)
         
-        lim_pID_a = [list_occ_pIDs_inc[i][0]>=MIN_OCC_MOST_ABUND_PID for i in range(len(list_occ_pIDs_inc))].count(True)
-        print 'lim_pID_a:'+str(lim_pID_a)
-        print 'nb occ pID_a at pos lim_pID_a-1 in list_occ_pIDs_dec: '+str(list_occ_pIDs_dec[lim_pID_a-1][0])
-        print 'nb occ pID_a at pos lim_pID_a in list_occ_pIDs_dec: '+str(list_occ_pIDs_dec[lim_pID_a][0])
-
-        for pID_a in list_occ_pIDs_dec[:lim_pID_a]:
-            if dict_neighbor_state_pIDs[pID_a[1]] == False:
-                for ii,pID_r in enumerate(list_occ_pIDs_inc[:list_occ_pIDs_inc.index(pID_a)]):
-                     if((dict_neighbor_state_pIDs[pID_r[1]] == False) and (pID_r[0] < pID_a[0])):
-                        pIDs_align_score = align.localms(pID_a[1], pID_r[1], 1, 0, -0.6, -0.3)
-                        if (len(pIDs_align_score) != 0): # if pID_a[1] and pID_r[1] can be aligned
-                            if (pIDs_align_score[0][2] >= 8.4):
-                                print 'pID_a: '+str(pID_a)+', pID_r: '+str(pID_r)
-                                print pIDs_align_score[0][0], pIDs_align_score[0][1]
-                                print dict_neighbor_state_pIDs[pID_a[1]], dict_neighbor_state_pIDs[pID_r[1]]
-                                print "possible neighbors"
+            lim_pID_a = [list_occ_pIDs_inc[i][0]>=MIN_OCC_MOST_ABUND_PID for i in range(len(list_occ_pIDs_inc))].count(True)
+            print 'lim_pID_a:'+str(lim_pID_a)
+            print 'nb occ pID_a at pos lim_pID_a-1 in list_occ_pIDs_dec: '+str(list_occ_pIDs_dec[lim_pID_a-1][0])
+            print 'nb occ pID_a at pos lim_pID_a in list_occ_pIDs_dec: '+str(list_occ_pIDs_dec[lim_pID_a][0])
+
+            for pID_a in list_occ_pIDs_dec[:lim_pID_a]:
+                if dict_neighbor_state_pIDs[pID_a[1]] == False:
+                    for ii,pID_r in enumerate(list_occ_pIDs_inc[:list_occ_pIDs_inc.index(pID_a)]):
+                        if((dict_neighbor_state_pIDs[pID_r[1]] == False) and (pID_r[0] < pID_a[0])):
+                            pIDs_align_score = align.localms(pID_a[1], pID_r[1], 1, 0, -0.6, -0.3)
+                            if (len(pIDs_align_score) != 0): # if pID_a[1] and pID_r[1] can be aligned
+                                if (pIDs_align_score[0][2] >= len(pID_a[1])-1-0.6):
+                                    print 'pID_a: '+str(pID_a)+', pID_r: '+str(pID_r)
+                                    print pIDs_align_score[0][0], pIDs_align_score[0][1]
+                                    print dict_neighbor_state_pIDs[pID_a[1]], dict_neighbor_state_pIDs[pID_r[1]]
+                                    print "possible neighbors"
                     
-                                if (pID_a[0] >= MIN_OCC_MOST_ABUND_PID): # if pID_a occurs enough times to have neighbors
-                                    if (pID_r[0] != 2): # pID_r occurs once or more than twice: only one sequence comparison to do
-                                        seq_a = dict_cons_seq[pID_a[1]][0][2] # [nb_fwd,nb_rev,seq]
-                                        if(pID_r[0] == 1): # pID_r occurs once: its sequence is in dict_all_reads
-                                            # seq_r = dict_all_reads[pID_r[0]][0][2]
-                                            seq_r = dict_all_reads[pID_r[1]][0][2] # [nb_fwd,nb_rev,seq]
-                                        else: # pID_r occurs at least 3 times: its consensus sequence is in dict_cons_seq
-                                            seq_r = dict_cons_seq[pID_r[1]][0][2] # [nb_fwd,nb_rev,seq]
-                                        score = align.localms(seq_r, seq_a, 1, 0, -1, -1)[0]
-                                        print 'Alignment score: '+ str(score[2])
-                                        print score[0]
-                                        print score[1]
-                                        if(score[2] >= len(seq_a) - DIST_MAX):
-                                            dict_neighbor_of[pID_r[1]] = pID_a[1]
-                                            dict_neighbor_state_pIDs[pID_r[1]] = True
-                                            print "NEIGHBORS !"
-
-                                    else: # pID_r occurs twice: comparison of sequence of pID_a with its 2 sequences in dict_all_reads
-                                        seq_a = dict_cons_seq[pID_a[1]][0][2] # [nb_fwd,nb_rev,seq]
-                                        seq_r_0 = dict_all_reads[pID_r[1]][0][2] # [nb_fwd,nb_rev,seq]
-                                        seq_r_1 = dict_all_reads[pID_r[1]][1][2] # [nb_fwd,nb_rev,seq]
-                                        score_0 = align.localms(seq_r_0, seq_a, 1, 0, -1, -1)[0]
-                                        score_1 = align.localms(seq_r_1, seq_a, 1, 0, -1, -1)[0]
-                                        print 'Alignment score_0: '+ str(score_0[2])
-                                        print score_0[0]
-                                        print score_0[1]
-                                        print 'Alignment score_1: '+ str(score_1[2])
-                                        print score_1[0]
-                                        print score_1[1]
-                                        if((score_0[2] >= len(seq_a) - DIST_MAX) or (score_1[2] >= len(seq_a) - DIST_MAX)):
-                                            dict_neighbor_of[pID_r[1]] = pID_a[1]
-                                            dict_neighbor_state_pIDs[pID_r[1]] = True
-                                            print "NEIGHBORS !"
-                                else:
-                                    print 'PB: pID_a does not have the good number of occurrences !!!'
+                                    if (pID_a[0] >= MIN_OCC_MOST_ABUND_PID): # if pID_a occurs enough times to have neighbors
+                                        if (pID_r[0] != 2 or (pID_r[0]==2 and len(dict_all_reads[pID_r[1]])==1)): # pID_r occurs once or more than twice or twice with the same read: only one sequence comparison to do
+                                            seq_a = dict_cons_seq[pID_a[1]][0][2] # [nb_fwd,nb_rev,seq]
+                                            if(pID_r[0] <= 2): # pID_r occurs once or twice: its sequence is in dict_all_reads
+                                                seq_r = dict_all_reads[pID_r[1]][0][2] # [nb_fwd,nb_rev,seq]
+                                            else: # pID_r occurs at least 3 times: its consensus sequence is in dict_cons_seq
+                                                seq_r = dict_cons_seq[pID_r[1]][0][2] # [nb_fwd,nb_rev,seq]
+                                            score = align.localms(seq_r, seq_a, 1, 0, -1, -1)[0]
+                                            print 'Alignment score: '+ str(score[2])
+                                            print score[0]
+                                            print score[1]
+                                            if(score[2] >= len(seq_a) - DIST_MAX):
+                                                dict_neighbor_of[pID_r[1]] = pID_a[1]
+                                                dict_neighbor_state_pIDs[pID_r[1]] = True
+                                                print "NEIGHBORS !"
+
+                                        else: # pID_r occurs twice, with two distinct reads: compare the sequence of pID_a with its 2 sequences in dict_all_reads
+                                            seq_a = dict_cons_seq[pID_a[1]][0][2] # [nb_fwd,nb_rev,seq]
+                                            seq_r_0 = dict_all_reads[pID_r[1]][0][2] # [nb_fwd,nb_rev,seq]
+                                            seq_r_1 = dict_all_reads[pID_r[1]][1][2] # [nb_fwd,nb_rev,seq]
+                                            score_0 = align.localms(seq_r_0, seq_a, 1, 0, -1, -1)[0]
+                                            score_1 = align.localms(seq_r_1, seq_a, 1, 0, -1, -1)[0]
+                                            print 'Alignment score_0: '+ str(score_0[2])
+                                            print score_0[0]
+                                            print score_0[1]
+                                            print 'Alignment score_1: '+ str(score_1[2])
+                                            print score_1[0]
+                                            print score_1[1]
+                                            if((score_0[2] >= len(seq_a) - DIST_MAX) or (score_1[2] >= len(seq_a) - DIST_MAX)):
+                                                dict_neighbor_of[pID_r[1]] = pID_a[1]
+                                                dict_neighbor_state_pIDs[pID_r[1]] = True
+                                                print "NEIGHBORS !"
+                                    else:
+                                        print 'PB: pID_a does not have the good number of occurrences !!!'
 
                 
-        dict_neighbors = defaultdict(list)
-        for pID in dict_neighbor_of.keys():
-            dict_neighbors[dict_neighbor_of[pID]].append(pID)
+            dict_neighbors = defaultdict(list)
+            for pID in dict_neighbor_of.keys():
+                dict_neighbors[dict_neighbor_of[pID]].append(pID)
         
-        print 'neighbors:'
-        nb_neighbors_stated = 0
-        for pID in dict_neighbor_state_pIDs.keys():
-            if dict_neighbor_state_pIDs[pID]==True:
-                nb_neighbors_stated += 1
-                print pID
+            print 'neighbors:'
+            nb_neighbors_stated = 0
+            for pID in dict_neighbor_state_pIDs.keys():
+                if dict_neighbor_state_pIDs[pID]==True:
+                    nb_neighbors_stated += 1
+                    print pID
 
         
         
-        ##############################################
-        # -- WRITE THE NEIGHBORS FILTERED READS FILE #
-        ##############################################
-        count_reads_written = 0
-        count_neighbors_reads_written = 0
+            ##############################################
+            # -- WRITE THE NEIGHBORS FILTERED READS FILE #
+            ##############################################
+            count_reads_written = 0
+            count_neighbors_reads_written = 0
         
-        #with open(filtered_reads_file_name[:-3]+'_neighbors_indels.fa', 'w') as neighbors_filtered_reads_file:
-        # write the new filtered reads file (mutant-indels):
-        filtered_reads_file_basename_suffix = filtered_reads_file_basename.split('_')[-1].split('.')[0]
-        with open(str(path_to_data+prefix_date_and_id+'-mutant-indels'+'_'+barcode+'_'+filtered_reads_file_basename_suffix+'.fasta'), 'w') as neighbors_filtered_reads_file:
-            for pID in dict_all_reads.keys(): # for all pIDs                                
-                if not dict_neighbor_state_pIDs[pID]: # if pID "is not a neighbor"
-                    print 'write reads for pID: ' + pID
-
-                    # write the reads corresponding to pID
-                    for cur_read in dict_all_reads[pID]: # cur_read is a list [nb_occ_fwd,nb_occ_rev,seq]
-                        pid_orig = pID
-                        #read_id = cur_read[0]
-                        #read_dir = cur_read[1]
-                        mut_flag = ''
-                        new_read_id = str(pID+'_'+str(cur_read[0])+'_'+str(cur_read[1])+'_'+pid_orig+mut_flag)
-                        neighbors_filtered_reads_file.write('>'+new_read_id+'\n') # write the new id of the read
-                        neighbors_filtered_reads_file.write(cur_read[2]+'\n') # write the sequence
-                        count_reads_written += (cur_read[0]+cur_read[1])
-
-                    # write the neighbors if there are
-                    if (len(dict_neighbors[pID]) > 0):
-                        print '#neighbors pIDs: ' + str(len(dict_neighbors[pID]))
-                        for pID_n in dict_neighbors[pID]: # for all the neighbors pID_n of pID
-                            # write the neighbors pID_n
-                            for read_n in dict_all_reads[pID_n]: # read_n is a list [nb_occ_fwd,nb_occ_rev,seq]
-                                pid_orig = pID_n
-                                #read_id = read_n[0]
-                                #read_dir = read_n[1]
-                                mut_flag = '_m'
-                                new_read_id = str(pID+'_'+str(read_n[0])+'_'+str(read_n[1])+'_'+pid_orig+mut_flag)
-                                neighbors_filtered_reads_file.write('>'+new_read_id+'\n') # write the new id of the read
-                                neighbors_filtered_reads_file.write(read_n[2]+'\n') # write the sequence
-                                count_reads_written += (read_n[0]+read_n[1])
-                                count_neighbors_reads_written += (read_n[0]+read_n[1])
-
-
-        print 'nb reads written: '+str(count_reads_written)
-        print 'nb neighbored reads written: '+str(count_neighbors_reads_written)
+            # write the new filtered reads file (mutant-indels):
+            filtered_reads_file_basename_suffix = filtered_reads_file_basename.split('_')[-1].split('.')[0]
+            with open(str(path_to_data+prefix_date_and_id+'-mutant-indels'+'_'+barcode+'_'+filtered_reads_file_basename_suffix+'.fasta'), 'w') as neighbors_filtered_reads_file:
+                for pID in dict_all_reads.keys(): # for all pIDs                                
+                    if not dict_neighbor_state_pIDs[pID]: # if pID "is not a neighbor"
+                        print 'write reads for pID: ' + pID
+
+                        # write the reads corresponding to pID
+                        for cur_read in dict_all_reads[pID]: # cur_read is a list [nb_occ_fwd,nb_occ_rev,seq]
+                            pid_orig = pID
+                            new_read_id = str(pID+'_'+str(cur_read[0])+'_'+str(cur_read[1])+'_'+pid_orig)
+                            neighbors_filtered_reads_file.write('>'+new_read_id+'\n') # write the new id of the read
+                            neighbors_filtered_reads_file.write(cur_read[2]+'\n') # write the sequence
+                            count_reads_written += (cur_read[0]+cur_read[1])
+
+                        # write the neighbors, if any
+                        if (len(dict_neighbors[pID]) > 0):
+                            print '#neighbors pIDs: ' + str(len(dict_neighbors[pID]))
+                            for pID_n in dict_neighbors[pID]: # for all the neighbors pID_n of pID
+                            
+                                # write the neighbors pID_n
+                                for read_n in dict_all_reads[pID_n]: # read_n is a list [nb_occ_fwd,nb_occ_rev,seq]
+                                    pid_orig = pID_n
+                                    mut_flag = '_m'
+                                    new_read_id = str(pID+'_'+str(read_n[0])+'_'+str(read_n[1])+'_'+pid_orig+mut_flag)
+                                    neighbors_filtered_reads_file.write('>'+new_read_id+'\n') # write the new id of the read
+                                    neighbors_filtered_reads_file.write(read_n[2]+'\n') # write the sequence
+                                    count_reads_written += (read_n[0]+read_n[1])
+                                    count_neighbors_reads_written += (read_n[0]+read_n[1])
+
+
+            print 'nb reads written: '+str(count_reads_written)
+            print 'nb neighbored reads written: '+str(count_neighbors_reads_written)
         
-        print 'nb neighbored pIDs: '+str(nb_neighbors_stated)
+            print 'nb neighbored pIDs: '+str(nb_neighbors_stated)
 
         
+        else:
+            print 'PB: "barcode" and "prefix_date_and_id" extracted from input filtered reads file and consensus sequences file are different, Stop here!'
     else:
-        print 'PB: "barcode" and "prefix_date_and_id" extracted from input filtered reads file and consensus sequences file are different, Stop here!'
-else:
-    print auto_file_name + ': usage: '+ auto_file_name + ' <filtered reads file (in ../data/)> <cons. seq. file (in ../templates/dir-<date_and_id>_consensus/)>'
+        print auto_file_name + ': usage: '+ auto_file_name + ' <filtered reads file (in ../data/)> <cons. seq. file (in ../templates/dir-<date_and_id>_consensus/)>'
index b05e8ee..a725728 100755 (executable)
@@ -1,7 +1,4 @@
 #!/usr/bin/python
-#/!\#
-#### /ebio/ag-neher/share/pograms/EPD/bins/python
-#/!\#
 
 # script that decontaminates a filtered reads file for a given barcode and run id
 # input: - filtered reads file to decontaminate (in ../data/)
@@ -30,340 +27,297 @@ import lib_tools as lt
 
 auto_file_name=str(sys.argv[0])
 
-DIST_MAX = 7
+#####
+DIST_MAX = 7 # maximal distance allowed for the reads alignments
+#####
 
-#dict_lim_min_score_align={
-#            (1,'ACG'):160, (1,'CAG'):160, (1,'GTA'):160, (1,'GTC'):160,
-#            (2,'ACG'):163, (2,'CGT'):160, (2,'TAC'):160 #(2,'CGT'):145
-#            }
 
-#RUN = 2
-#
-#prefix_date_and_id=['2013-08-08-reg1','2013-08-09-reg2']
-#path_to_data='../templates/rep_neighbors_indels_consensus_'+prefix_date_and_id[RUN-1]+'/'
-#barcodes={'2013-08-08-reg1':['ACG','CAG','GTA','GTC'], '2013-08-09-reg2':['ACG','CGT','TAC']}
-#
-#path_to_seq_clone_sanger_patient='../data/seq_clone_and_Sanger_patient.fsa'
-
-
-########
-# DEF FUNCTIONS
-########
-
-########
-#def align_SW_dist(seq1,seq2):
-#    score = align.globalms(seq1, seq2, 1, 0, -0.6, -0.3)[0]
-#    return score
-########
+if __name__=='__main__':
 
+    if (len(sys.argv)==5):
 
-if (len(sys.argv)==5):
+        # parse the input file names
+        relative_filtered_reads_file_path = str(sys.argv[1])
+        relative_consensus_seq_file_path = str(sys.argv[2])
+        relative_ref_seq_file_path = str(sys.argv[3])
+        RUN = int(sys.argv[4])
 
-    # parse the input file names
-    relative_filtered_reads_file_path = str(sys.argv[1])
-    relative_consensus_seq_file_path = str(sys.argv[2])
-    relative_ref_seq_file_path = str(sys.argv[3])
-    RUN = int(sys.argv[4])
+        path_to_data="../data/"
+        path_to_templates="../templates/"
 
-    path_to_data="../data/"
-    path_to_templates="../templates/"
-
-    filtered_reads_file_basename = relative_filtered_reads_file_path.split('/')[-1]
-    consensus_seq_file_basename = relative_consensus_seq_file_path.split('/')[-1]
-    ref_seq_file_basename = relative_ref_seq_file_path.split('/')[-1]
+        filtered_reads_file_basename = relative_filtered_reads_file_path.split('/')[-1]
+        consensus_seq_file_basename = relative_consensus_seq_file_path.split('/')[-1]
+        ref_seq_file_basename = relative_ref_seq_file_path.split('/')[-1]
     
-    [prefix_date_and_id,barcode] = [filtered_reads_file_basename.split('_')[i] for i in [0,1]]
-    [prefix_date_and_id_cs,barcode_cs] = [consensus_seq_file_basename.split('_')[i] for i in [0,2]]
-    barcode_cs = barcode_cs.split('.')[0] #remove the extension ".fasta"
-
+        [prefix_date_and_id,barcode] = [filtered_reads_file_basename.split('_')[i] for i in [0,1]]
+        [prefix_date_and_id_cs,barcode_cs] = [consensus_seq_file_basename.split('_')[i] for i in [0,2]]
+        barcode_cs = barcode_cs.split('.')[0] #remove the extension ".fasta"
 
     
-    #check prefix_date_and_id and barcode similarity between filtered reads file and consensus sequences file
-    if ((prefix_date_and_id == prefix_date_and_id_cs) and (barcode == barcode_cs)):
-
-        # parse the reference sequences file in dict_ref_seq
-        dict_ref_seq={}
-        dict_len_ref_seq={}
-        dict_barcodes=defaultdict(list)
-        with open(relative_ref_seq_file_path,'r') as infile:
-            print '--Parsing the reference sequences file: '+relative_ref_seq_file_path
-            for record in SeqIO.parse(infile, 'fasta'):
-                run_id = int(record.id.split(' ')[0])
-                bc = str(record.id.split('_')[1])
-                print run_id
-                print bc
-                dict_barcodes[run_id].append(bc)
-                dict_ref_seq[(run_id,bc)]=record.seq
-                dict_len_ref_seq[(run_id,bc)]=len(record.seq)
-            # print dict_ref_seq
-            print dict_barcodes
-        NB_RUNS = len(dict_barcodes.keys())
-
-        #2.parse the consensus sequences files for the given prefix_date_and_id in dict_cons_seq
-        dict_cons_seq={}
-        dict_pIDs_occ={}
-        dict_score_alignments={}
-        print '--Parsing the consensus sequences file for run '+str(RUN)+', barcode '+barcode
-        with open(relative_path_to_consensus_seq_file_basename,'r') as infile:
-            for record in SeqIO.parse(infile, 'fasta'):
-                [pID, nb_reads_fwd, nb_reads_rev] = [record.id.split('_')[i] for i in [0,1,2]]
-                nb_reads_fwd = int(nb_reads_fwd)
-                nb_reads_rev = int(nb_reads_rev)
-                # [pID, nb_reads] = [str(record.id.split('_')[i]) for i in range(2)]
-                dict_cons_seq[pID]=record.seq
-                dict_pIDs_occ[pID]=(nb_reads_fwd,nb_reads_rev)
-                score_align = lt.align_SW_dist(dict_ref_seq[(RUN,barcode)], record.seq)[2]
-                print '----'+str(RUN)+': '+barcode+' - '+pID+': '+pID+', align score: '+str(score_align)
-                dict_score_alignments[pID]=score_align
-
-        # 3.parse the filtered reads file (for a given barcode) to check pIDs contamination dict_filtered_reads_pIDs_once_twice
-        #for bc in dict_barcodes[RUN]:
-        dict_filtered_reads_pIDs_once_twice=defaultdict(list)
-        dict_pIDs_occ_once_twice=defaultdict(int)
-        dict_score_alignments_once_twice=defaultdict(list)
-        print '--Parsing the filtered reads file for run '+str(RUN)+', barcode '+barcode
-        with open(relative_filtered_reads_file_path, 'r') as infile: # read the filtered reads file for the given barcode
-            for record in SeqIO.parse(infile,'fasta'): # for each read
-                [pID, nb_reads_fwd, nb_reads_rev] = [record.id.split('_')[i] for i in [0,1,2]]# retrieve the pID (parent, not the original) and the nb of fwd/rev reads
-                nb_reads_fwd = int(nb_reads_fwd)
-                nb_reads_rev = int(nb_reads_rev)
-                if not (pID in dict_cons_seq.keys()): # if pID is not already in dict_cons_seq => it appears only once or twice
-                    dict_filtered_reads_pIDs_once_twice[pID].append(str(seq)) # add the pID and read to the dictionary containing "onces" and "twices"
-                    dict_pIDs_occ_once_twice[pID]+=(nb_reads_fwd,nb_reads_rev)
+        # check prefix_date_and_id and barcode similarity between filtered reads file and consensus sequences file
+        if ((prefix_date_and_id == prefix_date_and_id_cs) and (barcode == barcode_cs)):
+
+            #
+            # 1.parse the reference sequences file in dict_ref_seq
+            #
+            dict_ref_seq={}
+            dict_len_ref_seq={}
+            dict_barcodes=defaultdict(list)
+            with open(relative_ref_seq_file_path,'r') as infile:
+                print '--Parsing the reference sequences file: '+relative_ref_seq_file_path
+                for record in SeqIO.parse(infile, 'fasta'):
+                    run_id = int(record.description.split(' ')[0])
+                    bc = str(record.description.split(' ')[1])
+                    print 'run_id: '+str(run_id)
+                    print 'bc: '+str(bc)
+                    dict_barcodes[run_id].append(bc)
+                    dict_ref_seq[(run_id,bc)]=record.seq
+                    dict_len_ref_seq[(run_id,bc)]=len(record.seq)
+
+                print dict_barcodes
+                NB_RUNS = len(dict_barcodes.keys())
+
+            #
+            # 2.parse the consensus sequences files for the given prefix_date_and_id in dict_cons_seq
+            #
+            dict_cons_seq={}
+            dict_pIDs_occ={}
+            dict_score_alignments={}
+            print '--Parsing the consensus sequences file for run '+str(RUN)+', barcode '+barcode
+            with open(relative_consensus_seq_file_path,'r') as infile:
+                for record in SeqIO.parse(infile, 'fasta'):
+                    [pID, nb_reads_fwd, nb_reads_rev] = [record.id.split('_')[i] for i in [0,1,2]]
+                    nb_reads_fwd = int(nb_reads_fwd)
+                    nb_reads_rev = int(nb_reads_rev)
+                    dict_cons_seq[pID]=record.seq
+                    dict_pIDs_occ[pID]=(nb_reads_fwd,nb_reads_rev)
                     score_align = lt.align_SW_dist(dict_ref_seq[(RUN,barcode)], record.seq)[2]
-                    print '----'+str(RUN)+': '+barcode+' - '+pID+': '+pID+', align score: '+str(score_align)
-                    dict_score_alignments_once_twice[pID].append(score_align)
-            print 'check occ pIDs in once twice dictionary: '+str(max([dict_pIDs_occ_once_twice[i] for i in dict_pIDs_occ_once_twice.keys()])<=2)
-
-
-        #4.print some statistics
-        #for bc in barcodes[prefix_date_and_id[RUN-1]]:
-        print '** LIM SCORE ALIGN-'+str(RUN)+': '+barcode+':'
-        print '-- PIDS OCC >= 3'
-        print '---- min = '+str(min([dict_score_alignments[i] for i in dict_score_alignments.keys()]))
-        print '---- max = '+str(max([dict_score_alignments[i] for i in dict_score_alignments.keys()]))
-        average_scores=sum([dict_score_alignments[i] for i in dict_score_alignments.keys()])/len(dict_score_alignments)
-        print '---- average score = '+str(average_scores)
-        print '---- #pids (total) = '+str(len(dict_pIDs_occ.keys()))
-        print '---- #reads (total) = '+str(sum([dict_pIDs_occ[pID] for pID in dict_pIDs_occ.keys()]))
-        print '-- 1<= PIDS OCC <= 2'
-        print '---- min = '+str(min([min(dict_score_alignments_once_twice[i]) for i in dict_score_alignments_once_twice.keys()]))
-        print '---- max = '+str(max([max(dict_score_alignments_once_twice[i]) for i in dict_score_alignments_once_twice.keys()]))
-        print '---- #pids (total) = '+str(len(dict_pIDs_occ_once_twice.keys()))
-        print '---- #reads (total) = '+str(sum([dict_pIDs_occ_once_twice[pID] for pID in dict_pIDs_occ_once_twice.keys()]))
-
-
-        #5.count good and bad alignments
-        # for bc in barcodes[prefix_date_and_id[RUN-1]]:
-
-        count_bad_scores = 0
-        nb_bad_scores_reads = 0
-        count_good_scores = 0
-        nb_good_scores_reads = 0
+                    print '----'+str(RUN)+': '+barcode+' - '+pID+': align score: '+str(score_align)
+                    dict_score_alignments[pID]=score_align
+
+            #        
+            # 3.parse the filtered reads file (for a given barcode) to check pIDs contamination dict_filtered_reads_pIDs_once_twice
+            #
+            dict_filtered_reads_pIDs_once_twice=defaultdict(list)
+            dict_pIDs_occ_once_twice=defaultdict(int)
+            dict_score_alignments_once_twice=defaultdict(list)
+            print '--Parsing the filtered reads file for run '+str(RUN)+', barcode '+barcode
+            with open(relative_filtered_reads_file_path, 'r') as infile: # read the filtered reads file for the given barcode
+                for record in SeqIO.parse(infile,'fasta'): # for each read
+                    [pID, nb_reads_fwd, nb_reads_rev] = [record.id.split('_')[i] for i in [0,1,2]]# retrieve the pID (parent, not the original) and the nb of occurrences fwd/rev of this read
+                    nb_reads_fwd = int(nb_reads_fwd)
+                    nb_reads_rev = int(nb_reads_rev)
+                    if not (pID in dict_cons_seq.keys()): # if pID is not already in dict_cons_seq => it appears only once or twice
+                        dict_filtered_reads_pIDs_once_twice[pID].append(str(record.seq)) # add the pID and read to the dictionary containing "onces" and "twices"
+                        dict_pIDs_occ_once_twice[pID] += nb_reads_fwd + nb_reads_rev
+                        score_align = lt.align_SW_dist(dict_ref_seq[(RUN,barcode)], record.seq)[2]
+                        print '----'+str(RUN)+': '+barcode+' - '+pID+': align score: '+str(score_align)
+                        dict_score_alignments_once_twice[pID].append(score_align)
+                print 'check occ pIDs in once twice dictionary: '+str(max([dict_pIDs_occ_once_twice[i] for i in dict_pIDs_occ_once_twice.keys()])<=2)
+                print 'check nb score alignments once twice dictionary: '+str(max([len(dict_score_alignments_once_twice[pID]) for pID in dict_score_alignments_once_twice.keys()])<=2)
+
+            #
+            # 4.print some statistics
+            #
+            print '** LIM SCORE ALIGN-'+str(RUN)+': '+barcode+':'
+            print '-- PIDS OCC >= 3'
+            print '---- min = '+str(min([dict_score_alignments[i] for i in dict_score_alignments.keys()]))
+            print '---- max = '+str(max([dict_score_alignments[i] for i in dict_score_alignments.keys()]))
+            average_scores=sum([dict_score_alignments[i] for i in dict_score_alignments.keys()])/len(dict_score_alignments)
+            print '---- average score = '+str(average_scores)
+            print '---- #pids (total) = '+str(len(dict_pIDs_occ.keys()))
+            print '---- #reads (total) = '+str(sum([dict_pIDs_occ[pID][0]+dict_pIDs_occ[pID][1] for pID in dict_pIDs_occ.keys()]))
+            print '-- 1<= PIDS OCC <= 2'
+            print '---- min = '+str(min([min(dict_score_alignments_once_twice[i]) for i in dict_score_alignments_once_twice.keys()]))
+            print '---- max = '+str(max([max(dict_score_alignments_once_twice[i]) for i in dict_score_alignments_once_twice.keys()]))
+            print '---- #pids (total) = '+str(len(dict_pIDs_occ_once_twice.keys()))
+            print '---- #reads (total) = '+str(sum([dict_pIDs_occ_once_twice[pID] for pID in dict_pIDs_occ_once_twice.keys()]))
+
+            #
+            # 5.count good and bad alignments
+            #
+            count_bad_scores = 0
+            nb_bad_scores_reads = 0
+            count_good_scores = 0
+            nb_good_scores_reads = 0
     
-        count_bad_scores_once_twice = 0
-        nb_bad_scores_reads_once_twice = 0
-        count_good_scores_once_twice = 0
-        nb_good_scores_reads_once_twice = 0
-
-        for pID in dict_score_alignments.keys():
-            if (dict_score_alignments[pID] <  dict_len_ref_seq[(RUN,barcode)]-DIST_MAX):
-                count_bad_scores += 1
-                nb_bad_scores_reads += dict_pIDs_occ[pID]
-            else:
-                count_good_scores += 1
-                nb_good_scores_reads += dict_pIDs_occ[pID]
-
-        for pID in dict_score_alignments_once_twice.keys():
-            if (max(dict_score_alignments_once_twice[pID]) <  dict_len_ref_seq[(RUN,barcode)]-DIST_MAX):
-                count_bad_scores_once_twice += 1
-                nb_bad_scores_reads_once_twice += dict_pIDs_occ_once_twice[pID]
-            else:
-                count_good_scores_once_twice += 1
-                nb_good_scores_reads_once_twice += dict_pIDs_occ_once_twice[pID]
-
-        print 'PIDS OCC >= 3:'
-        print '--RUN '+str(RUN)+': '+barcode+': #bad align scores: '+str(count_bad_scores)
-        print '--RUN '+str(RUN)+': '+barcode+': #bad align scores (reads): '+str(nb_bad_scores_reads)
-        print '--RUN '+str(RUN)+': '+barcode+': #good align scores: '+str(count_good_scores)
-        print '--RUN '+str(RUN)+': '+barcode+': #good align scores (reads): '+str(nb_good_scores_reads)
-
-        print '1 <= PIDS OCC <= 2:'
-        print '--RUN '+str(RUN)+': '+barcode+': #bad align scores: '+str(count_bad_scores_once_twice)
-        print '--RUN '+str(RUN)+': '+barcode+': #bad align scores (reads): '+str(nb_bad_scores_reads_once_twice)
-        print '--RUN '+str(RUN)+': '+barcode+': #good align scores: '+str(count_good_scores_once_twice)
-        print '--RUN '+str(RUN)+': '+barcode+': #good align scores (reads): '+str(nb_good_scores_reads_once_twice)
-        print '****'
+            count_bad_scores_once_twice = 0
+            nb_bad_scores_reads_once_twice = 0
+            count_good_scores_once_twice = 0
+            nb_good_scores_reads_once_twice = 0
+
+            for pID in dict_score_alignments.keys():
+                if (dict_score_alignments[pID] <  dict_len_ref_seq[(RUN,barcode)]-DIST_MAX):
+                    count_bad_scores += 1
+                    nb_bad_scores_reads += dict_pIDs_occ[pID][0] + dict_pIDs_occ[pID][1]
+                else:
+                    count_good_scores += 1
+                    nb_good_scores_reads += dict_pIDs_occ[pID][0] + dict_pIDs_occ[pID][1]
 
+            for pID in dict_score_alignments_once_twice.keys():
+                if (max(dict_score_alignments_once_twice[pID]) <  dict_len_ref_seq[(RUN,barcode)]-DIST_MAX):
+                    count_bad_scores_once_twice += 1
+                    nb_bad_scores_reads_once_twice += dict_pIDs_occ_once_twice[pID]
+                else:
+                    count_good_scores_once_twice += 1
+                    nb_good_scores_reads_once_twice += dict_pIDs_occ_once_twice[pID]
+
+            print 'PIDS OCC >= 3:'
+            print '--RUN '+str(RUN)+': '+barcode+': #bad align scores: '+str(count_bad_scores)
+            print '--RUN '+str(RUN)+': '+barcode+': #bad align scores (reads): '+str(nb_bad_scores_reads)
+            print '--RUN '+str(RUN)+': '+barcode+': #good align scores: '+str(count_good_scores)
+            print '--RUN '+str(RUN)+': '+barcode+': #good align scores (reads): '+str(nb_good_scores_reads)
+
+            print '1 <= PIDS OCC <= 2:'
+            print '--RUN '+str(RUN)+': '+barcode+': #bad align scores: '+str(count_bad_scores_once_twice)
+            print '--RUN '+str(RUN)+': '+barcode+': #bad align scores (reads): '+str(nb_bad_scores_reads_once_twice)
+            print '--RUN '+str(RUN)+': '+barcode+': #good align scores: '+str(count_good_scores_once_twice)
+            print '--RUN '+str(RUN)+': '+barcode+': #good align scores (reads): '+str(nb_good_scores_reads_once_twice)
+            print '****'
+
+            #
+            # 6.check contamination
+            #
+            print '********'
+            print 'Check contamination'
+            dict_possible_good_align={}#contains the alignment scores of the pIDs cons. seq. for bc with the ref. seq. of the other barcodes
+            dict_reclassif={}
+            dict_count_reclassif_pIDs=defaultdict(int)
+            dict_count_reclassif_reads=defaultdict(int)
+            print 'RUN '+str(RUN)+' - barcode '+barcode+':'
+            #index_bc = barcodes[prefix_date_and_id[RUN-1]].index(bc)#index of the barcode bc in the barcodes list
         
-        # 6.check contamination
-        print '********'
-        print 'Check contamination'
-        dict_possible_good_align={}#contains the alignment scores of the pIDs cons. seq. for bc with the ref. seq. of the other barcodes
-        dict_reclassif={}
-        dict_count_reclassif_pIDs=defaultdict(int)
-        dict_count_reclassif_reads=defaultdict(int)
-        #for bc in barcodes[prefix_date_and_id[RUN-1]]:
-        print 'RUN '+str(RUN)+' - barcode '+barcode+':'
-        #dict_possible_good_align={}
-        #dict_reclassif={}
-        #dict_count_reclassif_pIDs=defaultdict(int)
-        #dict_count_reclassif_reads=defaultdict(int)
-        index_bc = barcodes[prefix_date_and_id[RUN-1]].index(bc)#index of the barcode bc in the barcodes list
-        #barcodes_to_test = barcodes[prefix_date_and_id[RUN-1]][:index_bc]+barcodes[prefix_date_and_id[RUN-1]][index_bc+1:]#list of barcodes - {bc}
-        barcodes_to_test = [i for i in dict_ref_seq.keys() if (i!=(RUN,barcode))]
-
-        # loop for pIDs with occ >= 3
-        print 'loop for pIDs with occ >= 3'
-        for pID in dict_score_alignments.keys():
-            if (dict_score_alignments[pID] < dict_len_ref_seq[(RUN,barcode)]-DIST_MAX):
-                # if the alignment score of cons. seq. for pID with the reference sequence for bc is lower than the threshold
-                print '---found 1 bad alignment: '+pID+', '+str(dict_score_alignments[pID])+' < '+str(dict_len_ref_seq[(RUN,barcode)]-DIST_MAX)
-                dict_possible_good_align[pID]=[]
-                dict_reclassif[pID]=[]
-                
-                for (run2,bc2) in barcodes_to_test: # compare the alignment scores of cons. seq. pID with the ref. seq. for the other barcodes
-                    score_align_with_bc2 = lt.align_SW_dist(dict_ref_seq[(run2,bc2)], dict_cons_seq[pID])[2]
-                    dict_possible_good_align[pID].append((run2,bc2,score_align_with_bc2))
-                    if(score_align_with_bc2 >= dict_len_ref_seq[(run2,bc2)]-DIST_MAX):
-                        print '------possible reclassif of pID '+pID+' (#occ: '+str(dict_pIDs_occ[pID])+') in run '+str(run2) +', bc2 '+bc2+', with align score: '+str(score_align_with_bc2)+' >= '+str(dict_len_ref_seq[(run2,bc2)]-DIST_MAX)
-                        dict_count_reclassif_pIDs[(run2,bc2)]+=1
-                        dict_count_reclassif_reads[(run2,bc2)]+=dict_pIDs_occ[pID]
-                        dict_reclassif[pID].append((run2,bc2,score_align_with_bc2))
-
-                #for bc3 in barcodes[prefix_date_and_id[RUN%2]]: # compare the alignment scores of cons. seq. pID with the ref. seq. for the barcodes of the other run
-                #    score_align_with_bc3 = align_SW_dist(dict_scsp[((RUN%2)+1,bc3)], dict_cons_seq[(RUN,bc)][pID])[2]
-                #    dict_possible_good_align[(RUN,bc)][pID].append(((RUN%2)+1,bc3,score_align_with_bc3))
-                #    if(score_align_with_bc3 >= dict_lim_min_score_align[((RUN%2)+1, bc3)]):
-                #        print '------possible reclassif of pID '+pID+' (#occ: '+str(dict_pIDs_occ[(RUN,bc)][pID])+') in run '+str((RUN%2)+1)+', bc3 '+bc3+', with align score: '+str(score_align_with_bc3)+' >= '+str(dict_lim_min_score_align[((RUN%2)+1, bc3)])
-                #        dict_count_reclassif_pIDs[(RUN,bc)][((RUN%2)+1,bc3)]+=1
-                #        dict_count_reclassif_reads[(RUN,bc)][((RUN%2)+1,bc3)]+=dict_pIDs_occ[(RUN,bc)][pID]
-                #        dict_reclassif[(RUN,bc)][pID].append(((RUN%2)+1,bc3,score_align_with_bc3))
-
-
-        # loop for pIDs with 1 <= occ <= 2
-        print 'loop for pIDs with 1 <= occ <=2'
-        for pID in dict_score_alignments_once_twice.keys():
-            if (max(dict_score_alignments_once_twice[pID]) < dict_len_ref_seq[(RUN,barcode)]-DIST_MAX):
-                # if the alignment score (occ=1) or the best one (occ=2) for pID with the reference sequence for bc is lower than the threshold
-                print '---found 1 bad alignment: '+pID+', '+str(max(dict_score_alignments_once_twice[pID]))+' < '+str(dict_len_ref_seq[(RUN,barcode)]-DIST_MAX)
-                dict_possible_good_align[pID]=[]
-                dict_reclassif[pID]=[]
-                #nb_occ_cur_pID = len(dict_filtered_reads_pIDs_once_twice[(RUN,bc)][pID])      
-                nb_occ_cur_pID = sum([dict_pIDs_occ_once_twice[pID][i] for i in [0,1]])
+            barcodes_to_test = [i for i in dict_ref_seq.keys() if (i!=(RUN,barcode))]
+
+            # loop for pIDs with occ >= 3
+            print 'loop for pIDs with occ >= 3'
+            for pID in dict_score_alignments.keys():
+                if (dict_score_alignments[pID] < dict_len_ref_seq[(RUN,barcode)]-DIST_MAX):
+                    # if the alignment score of cons. seq. for pID with the reference sequence for bc is lower than the threshold
+                    print '---found 1 bad alignment: '+pID+', '+str(dict_score_alignments[pID])+' < '+str(dict_len_ref_seq[(RUN,barcode)]-DIST_MAX)
+                    dict_possible_good_align[pID]=[]
+                    dict_reclassif[pID]=[]
                 
-                for bc2 in barcodes_to_test: # compare the alignment scores of the sequence(s) of pID with the ref. seq. for the other barcodes
-                    score_align_with_bc2 = max([lt.align_SW_dist(dict_ref_seq[(run2,bc2)], dict_filtered_reads_pIDs_once_twice[pID][i])[2] for i in range(nb_occ_cur_pID)])
-                    dict_possible_good_align[pID].append((run2,bc2,score_align_with_bc2))
-                    if(score_align_with_bc2 >= dict_len_ref_seq[(run2,bc2)]-DIST_MAX):
-                        print '------possible reclassif of pID '+pID+' (#occ: '+str(nb_occ_cur_pID)+') in run '+str(run2) +', bc2 '+bc2+', with align score: '+str(score_align_with_bc2)+' >= '+str(dict_len_ref_seq[(run2,bc2)]-DIST_MAX)
-                        dict_count_reclassif_pIDs[(run2,bc2)]+=1
-                        dict_count_reclassif_reads[(run2,bc2)]+=nb_occ_cur_pID
-                        dict_reclassif[pID].append((run2,bc2,score_align_with_bc2))
+                    for (run2,bc2) in barcodes_to_test: # compare the alignment scores of cons. seq. pID with the ref. seq. for the other barcodes
+                        score_align_with_bc2 = lt.align_SW_dist(dict_ref_seq[(run2,bc2)], dict_cons_seq[pID])[2]
+                        dict_possible_good_align[pID].append((run2,bc2,score_align_with_bc2))
+                        if(score_align_with_bc2 >= dict_len_ref_seq[(run2,bc2)]-DIST_MAX):
+                            print '------possible reclassif of pID '+pID+' (#occ (fwd, rev)): '+str(dict_pIDs_occ[pID])+') in run '+str(run2) +', bc2 '+bc2+', with align score: '+str(score_align_with_bc2)+' >= '+str(dict_len_ref_seq[(run2,bc2)]-DIST_MAX)
+                            dict_count_reclassif_pIDs[(run2,bc2)]+=1
+                            dict_count_reclassif_reads[(run2,bc2)]+=dict_pIDs_occ[pID][0]+dict_pIDs_occ[pID][1]
+                            dict_reclassif[pID].append((run2,bc2,score_align_with_bc2))
 
-
-                #for bc3 in barcodes[prefix_date_and_id[RUN%2]]: # compare the alignment scores of the sequence(s) of pID with the ref. seq. for the barcodes of the other run
-                #    score_align_with_bc3 = max([align_SW_dist(dict_scsp[((RUN%2)+1,bc3)], dict_filtered_reads_pIDs_once_twice[(RUN,bc)][pID][i])[2] for i in range(nb_occ_cur_pID)])
-                #    dict_possible_good_align[(RUN,bc)][pID].append(((RUN%2)+1,bc3,score_align_with_bc3))
-                #    if(score_align_with_bc3 >= dict_lim_min_score_align[((RUN%2)+1, bc3)]):
-                #        print '------possible reclassif of pID '+pID+' (#occ: '+str(nb_occ_cur_pID)+') in run '+str((RUN%2)+1)+', bc3 '+bc3+', with align score: '+str(score_align_with_bc3)+' >= '+str(dict_lim_min_score_align[((RUN%2)+1, bc3)])
-                #        dict_count_reclassif_pIDs[(RUN,bc)][((RUN%2)+1,bc3)]+=1
-                #        dict_count_reclassif_reads[(RUN,bc)][((RUN%2)+1,bc3)]+=dict_pIDs_occ_once_twice[(RUN,bc)][pID]
-                #        dict_reclassif[(RUN,bc)][pID].append(((RUN%2)+1,bc3,score_align_with_bc3))
-
-
-
-        #7.print contamination statistics
-        
-        # for bc in barcodes[prefix_date_and_id[RUN-1]]:
-        count_pIDs_reclassif = 0
-        count_pIDs_no_good_alignments = 0
-        count_reads_no_good_alignments = 0
-        for pID in dict_reclassif.keys():
-            if len(dict_reclassif[pID])>0:
-                count_pIDs_reclassif+=1
-            else:
-                count_pIDs_no_good_alignments +=1
-                if pID in dict_pIDs_occ.keys():
-                    count_reads_no_good_alignments += dict_pIDs_occ[pID]
+            # loop for pIDs with 1 <= occ <= 2
+            print 'loop for pIDs with 1 <= occ <= 2'
+            for pID in dict_score_alignments_once_twice.keys():
+                if (max(dict_score_alignments_once_twice[pID]) < dict_len_ref_seq[(RUN,barcode)]-DIST_MAX):
+                    # if the alignment score (#occ=1) or (#occ=2 (same read)) or the best one (#occ=2 (distinct reads)) for pID with the reference sequence for bc is lower than the threshold
+                    print '---found 1 bad alignment: '+pID+', '+str(max(dict_score_alignments_once_twice[pID]))+' < '+str(dict_len_ref_seq[(RUN,barcode)]-DIST_MAX)
+                    dict_possible_good_align[pID]=[]
+                    dict_reclassif[pID]=[]
+                    nb_occ_cur_pID = dict_pIDs_occ_once_twice[pID]
+                    for (run2,bc2) in barcodes_to_test: # compare the alignment scores of the sequence(s) of pID with the ref. seq. for the other barcodes
+                        score_align_with_bc2 = max([lt.align_SW_dist(dict_ref_seq[(run2,bc2)], dict_filtered_reads_pIDs_once_twice[pID][i])[2] for i in range(len(dict_filtered_reads_pIDs_once_twice[pID]))])
+                        dict_possible_good_align[pID].append((run2,bc2,score_align_with_bc2))
+                        if(score_align_with_bc2 >= dict_len_ref_seq[(run2,bc2)]-DIST_MAX):
+                            print '------possible reclassif of pID '+pID+' (#occ: '+str(nb_occ_cur_pID)+') in run '+str(run2) +', bc2 '+bc2+', with align score: '+str(score_align_with_bc2)+' >= '+str(dict_len_ref_seq[(run2,bc2)]-DIST_MAX)
+                            dict_count_reclassif_pIDs[(run2,bc2)]+=1
+                            dict_count_reclassif_reads[(run2,bc2)]+=nb_occ_cur_pID
+                            dict_reclassif[pID].append((run2,bc2,score_align_with_bc2))
+
+
+            #
+            # 7. print contamination statistics
+            #
+            count_pIDs_reclassif = 0
+            count_pIDs_no_good_alignments = 0
+            count_reads_no_good_alignments = 0
+            for pID in dict_reclassif.keys():
+                if len(dict_reclassif[pID])>0:
+                    count_pIDs_reclassif+=1
                 else:
-                    count_reads_no_good_alignments += sum([dict_pIDs_occ_once_twice[pID][i] for i in [0,1]]) #dict_pIDs_occ_once_twice[(RUN,bc)][pID]
-
-        print 'RUN '+str(RUN)+' - barcode '+barcode+': #pIDs with better alignments from other barcodes: '+str(count_pIDs_reclassif)
-        print '-----> #pIDs with no good alignments: '+ str(count_pIDs_no_good_alignments)
-        print '-----> #reads with no good alignments: '+ str(count_reads_no_good_alignments)
-        print '##########'
-        #for bc in barcodes[prefix_date_and_id[RUN-1]]:
-        #print 'RUN '+str(RUN)+' - barcode '+barcode+':'
-        #print '--> #pIDs from:  '+str(dict_count_reclassif_pIDs[(RUN,bc)])
-        #print '--> #reads from: '+str(dict_count_reclassif_reads[(RUN,bc)])
-
-
-
-        #8.decontaminate the filtered reads (neighbors-indels) file
-        #dict_count_pIDs_to_bad_aligned_file={}
-        #dict_count_pIDs_from_contamination={}
-        #dict_count_pIDs_to_decontaminated_file={}
-
-        # for bc in barcodes[prefix_date_and_id[RUN-1]]:
-        dict_count_pIDs_to_bad_aligned_file=defaultdict(int)
-        dict_count_pIDs_from_contamination=defaultdict(int)
-        dict_count_pIDs_to_decontaminated_file=defaultdict(int)
-        list_of_reads_to_write_in_bad_aligned_file=[]
-        list_of_reads_to_write_in_decontaminated_file=[]
-        with open(relative_filtered_reads_file_path, 'r') as infile:
-        #with open(str('../data/'+prefix_date_and_id[RUN-1]+'_'+bc+'_sorted_and_filtered_reads_SW_min_occ_same_pid_1_neighbors_indels.fa'), 'r') as infile:
-            for record in SeqIO.parse(infile,'fasta'):
-                pID = record.id.split('_')[0]
-                if (pID in dict_reclassif.keys()): # if the seq. for pID aligns badly with the ref. seq. of its barcode 
-                    if not (len(dict_reclassif[pID])>0): # and if it does not align well with the ref. seq. of the other barcodes (run 1 and run 2)
-                        # don't know what to do with them: keep them in a file
-                        list_of_reads_to_write_in_bad_aligned_file.append(record)
-                        dict_count_pIDs_to_bad_aligned_file[pID]+=1
-                    else: # it aligns well with the ref. seq. of an other barcode (run 1 or run 2)
-                        # just throw it away
-                        dict_count_pIDs_from_contamination[pID]+=1
-                else: # the seq. for pID aligns well with the ref. seq. of its barcode
-                    # to be written in the filtered reads (neighbors_indels) decontaminated file
-                    list_of_reads_to_write_in_decontaminated_file.append(record)
-                    dict_count_pIDs_to_decontaminated_file[pID]+=1
-
-        # write the filtered reads (neighbors_indels) decontaminated file
-        suffix_filtered_reads_file = filtered_reads_file_basename.split('_')[-1]
-        with open(str(path_to_data+prefix_date_and_id+'-decontaminated'+'_'+barcode+suffix_filtered_reads_file), 'w') as outfile:
-            print 'write decontaminated file for run '+str(RUN)+', barcode: '+barcode
-            for record in list_of_reads_to_write_in_decontaminated_file:
-                outfile.write('>'+str(record.id)+'\n')
-                outfile.write(str(record.seq)+'\n')
-            print '-- #reads written: '+str(len(list_of_reads_to_write_in_decontaminated_file))
-
-        # write the (neighbors_indels) bad aligned reads file
-        with open(str(path_to_data+prefix_date_and_id+'-bad-aligned'+'_'+barcode+suffix_filtered_reads_file), 'w') as outfile:
-            print 'write bad aligned reads file for run '+str(RUN)+', barcode: '+bc
-            for record in list_of_reads_to_write_in_bad_aligned_file:
-                outfile.write('>'+str(record.id)+'\n')
-                outfile.write(str(record.seq)+'\n')
-            print '-- #reads written: '+str(len(list_of_reads_to_write_in_bad_aligned_file))
-    
-        # 9. print statistics
-        print '#############'
-        #for bc in barcodes[prefix_date_and_id[RUN-1]]:
-        print '-RUN '+str(RUN)+', barcode '+bc
-        print '---> nb non contaminant pIDs: '+str(len(dict_count_pIDs_to_decontaminated_file.keys()))
-        print '---> nb non contaminant reads: '+str(sum([dict_count_pIDs_to_decontaminated_file[i] for i in dict_count_pIDs_to_decontaminated_file.keys()]))
-        print '--'
-        print '---> nb contaminant pIDs: '+str(len(dict_count_pIDs_from_contamination.keys()))
-        print '---> nb contaminant reads: '+str(sum([dict_count_pIDs_from_contamination[i] for i in dict_count_pIDs_from_contamination.keys()]))
-        print '--'
-        print '---> nb bad aligned but non contaminant pIDs : '+str(len(dict_count_pIDs_to_bad_aligned_file.keys()))
-        print '---> nb bad aligned but non contaminant reads: '+str(sum([dict_count_pIDs_to_bad_aligned_file[i] for i in dict_count_pIDs_to_bad_aligned_file.keys()]))
+                    count_pIDs_no_good_alignments +=1
+                    if pID in dict_pIDs_occ.keys():
+                        count_reads_no_good_alignments += dict_pIDs_occ[pID][0]+dict_pIDs_occ[pID][1]
+                    else:
+                        count_reads_no_good_alignments += dict_pIDs_occ_once_twice[pID] 
 
+            print 'RUN '+str(RUN)+' - barcode '+barcode+': #pIDs with better alignments from other barcodes: '+str(count_pIDs_reclassif)
+            print '-----> #pIDs with no good alignments: '+ str(count_pIDs_no_good_alignments)
+            print '-----> #reads with no good alignments: '+ str(count_reads_no_good_alignments)
+            print '##########'
 
+        
+            #
+            # 8. decontaminate the filtered reads (mutant-indels) file
+            #
+            dict_count_pIDs_to_bad_aligned_file=defaultdict(int)
+            dict_count_pIDs_from_contamination=defaultdict(int)
+            dict_count_pIDs_to_decontaminated_file=defaultdict(int)
+            list_of_reads_to_write_in_bad_aligned_file=[]
+            list_of_reads_to_write_in_decontaminated_file=[]
+            list_of_reads_to_throw_away_contamination = []
+            with open(relative_filtered_reads_file_path, 'r') as infile:
+                for record in SeqIO.parse(infile,'fasta'):
+                    pID = record.id.split('_')[0]
+                    if (pID in dict_reclassif.keys()): # if the seq. for pID aligns badly with the ref. seq. of its barcode 
+                        if not (len(dict_reclassif[pID])>0): # and if it does not align well with the ref. seq. of the other barcodes (run 1 and run 2)
+                            # don't know what to do with them: keep them in a file
+                            list_of_reads_to_write_in_bad_aligned_file.append(record)
+                            dict_count_pIDs_to_bad_aligned_file[pID]+=1
+                        else: # it aligns well with the ref. seq. of an other barcode (and/or an other run)
+                            # just throw it away
+                            dict_count_pIDs_from_contamination[pID]+=1
+                            list_of_reads_to_throw_away_contamination.append(record)
+                    else: # the seq. for pID aligns well with the ref. seq. of its barcode
+                        # to be written in the filtered reads (neighbors_indels) decontaminated file
+                        list_of_reads_to_write_in_decontaminated_file.append(record)
+                        dict_count_pIDs_to_decontaminated_file[pID]+=1
+
+            suffix_filtered_reads_file = filtered_reads_file_basename.split('_')[-1]
+            # write the filtered reads decontaminated file
+            total_reads_written_in_decontaminated_file = 0
+            with open(str(path_to_data+prefix_date_and_id+'-decontaminated'+'_'+barcode+'_'+suffix_filtered_reads_file), 'w') as outfile:
+                print 'write decontaminated file for run '+str(RUN)+', barcode: '+barcode
+                for record in list_of_reads_to_write_in_decontaminated_file:
+                    outfile.write('>'+str(record.id)+'\n')
+                    outfile.write(str(record.seq)+'\n')
+                    total_reads_written_in_decontaminated_file += sum([int(record.id.split('_')[i]) for i in [1,2]])
+                print '---> nb non contaminant pIDs: '+str(len(dict_count_pIDs_to_decontaminated_file.keys()))
+                print '-- #distinct reads written: '+str(len(list_of_reads_to_write_in_decontaminated_file))
+                print '-- #total reads written in decontaminated file: '+str(total_reads_written_in_decontaminated_file)
+
+            # write the bad aligned reads file
+            total_reads_written_in_bad_aligned_reads_file = 0
+            with open(str(path_to_data+prefix_date_and_id+'-bad-aligned'+'_'+barcode+'_'+suffix_filtered_reads_file), 'w') as outfile:
+                print 'write bad aligned reads file for run '+str(RUN)+', barcode: '+barcode
+                for record in list_of_reads_to_write_in_bad_aligned_file:
+                    outfile.write('>'+str(record.id)+'\n')
+                    outfile.write(str(record.seq)+'\n')
+                    total_reads_written_in_bad_aligned_reads_file += sum([int(record.id.split('_')[i]) for i in [1,2]])
+                print '---> nb bad aligned but non contaminant pIDs : '+str(len(dict_count_pIDs_to_bad_aligned_file.keys()))
+                print '-- #distinct reads written: '+str(len(list_of_reads_to_write_in_bad_aligned_file))
+                print '-- #total reads written in bad aligned file: '+str(total_reads_written_in_bad_aligned_reads_file)
+                
+            # just count the reads to throw away (contaminant)
+            total_reads_to_throw_away = 0
+            print 'count the contaminant reads:'
+            for record in list_of_reads_to_throw_away_contamination:
+                total_reads_to_throw_away += sum([int(record.id.split('_')[i]) for i in [1,2]])
+            print '---> nb contaminant pIDs: '+str(len(dict_count_pIDs_from_contamination.keys()))
+            print '-- #distinct reads thrown away: '+str(len(list_of_reads_to_throw_away_contamination))
+            print '-- #total reads to throw away: '+str(total_reads_to_throw_away)
+        
+           
+            # print '---> nb non contaminant reads: '+str(sum([dict_count_pIDs_to_decontaminated_file[i] for i in dict_count_pIDs_to_decontaminated_file.keys()]))
+            # print '---> nb contaminant reads: '+str(sum([dict_count_pIDs_from_contamination[i] for i in dict_count_pIDs_from_contamination.keys()]))
+            # print '---> nb bad aligned but non contaminant reads: '+str(sum([dict_count_pIDs_to_bad_aligned_file[i] for i in dict_count_pIDs_to_bad_aligned_file.keys()]))
 
+        else:
+            print 'PB: prefix_date_and_id and barcode from filtered reads file and consensus sequences file are not similar ! Stop here !'
     else:
-        print 'PB: prefix_date_and_id and barcode from filtered reads file and consensus sequences file are not similar ! Stop here !'
-else:
-    print auto_file_name+': usage: '+auto_file_name+' <filtered reads file (in ../data/)> <consensus seq. file (in ../templates/<date-and-id>_consensus/)> <ref. seq. file (in ../data/)> <run_number>'
+        print auto_file_name+': usage: '+auto_file_name+' <filtered reads file (in ../data/)> <consensus seq. file (in ../templates/<date-and-id>_consensus/)> <ref. seq. file (in ../data/)> <run_number>'