rename, merge and debug (not yet finished) the decontamination and indels detection...
authorebenard <ebenard@ag-neher-benard.(none)>
Fri, 20 Sep 2013 16:47:14 +0000 (18:47 +0200)
committerebenard <ebenard@ag-neher-benard.(none)>
Fri, 20 Sep 2013 16:47:14 +0000 (18:47 +0200)
src/p7_decontamination.py
src/p8_detect_mutants_indels.py

index 801afb0..541a887 100644 (file)
@@ -54,66 +54,70 @@ if (len(sys.argv)==5):
     for pID in dict_all_reads:
         if pID in dict_cons_seq:
             score_align = lt.align_SW_dist(dict_ref_seq[true_seq_id], dict_cons_seq[pID][2])[2]
-            print '----'+str(RUN)+': '+barcode+' - '+pID+': '+pID+', align score: '+str(score_align)
+            print '----'+str(true_seq_id)+': '+pID+', align score: '+str(score_align)
             dict_score_alignments[pID].append(score_align)
         else:
             for nf, nb, seq in dict_all_reads[pID]:
                 score_align = lt.align_SW_dist(dict_ref_seq[true_seq_id], dict_cons_seq[pID][2])[2]
-                print '----'+str(RUN)+': '+barcode+' - '+pID+': '+pID+', align score: '+str(score_align)
+                print '----'+str(true_seq_id)+': '+pID+', align score: '+str(score_align)
                 dict_score_alignments[pID].append(score_align)
 
 
     #4.print some statistics
-    #for bc in barcodes[prefix_date_and_id[RUN-1]]:
     print '** LIM SCORE ALIGN-'+true_seq_id
-    print '---- min = '+str(min([dict_score_alignments[i] for i in dict_score_alignments.keys()]))
-    print '---- max = '+str(max([dict_score_alignments[i] for i in dict_score_alignments.keys()]))
-    average_scores=np.mean([dict_score_alignments[i] for i in dict_score_alignments.keys()])
+    #print '---- min = '+str(min([dict_score_alignments[i] for i in dict_score_alignments.keys()]))
+    print '---- min = '+str(min(np.hstack(dict_score_alignments.values())))
+    #print '---- max = '+str(max([dict_score_alignments[i] for i in dict_score_alignments.keys()]))
+    print '---- min = '+str(max(np.hstack(dict_score_alignments.values())))
+    #average_scores=np.mean([dict_score_alignments[i] for i in dict_score_alignments.keys()])
+    average_scores=np.mean(np.hstack(dict_score_alignments.values()))
     print '---- average score = '+str(average_scores)
     print '---- #pids (total) = '+str(len(dict_all_reads.keys()))
 
 
     #5.count good and bad alignments
-    # for bc in barcodes[prefix_date_and_id[RUN-1]]:
+    
 
-    count_bad_scores = sum([x<len(ref_seq[true_seq_id])-DIST_MAX for x in reads for n dict_score_alignments])
+    count_bad_scores = sum([x<len(ref_seq[true_seq_id])-DIST_MAX for x in reads for reads in dict_score_alignments])
     nb_bad_scores_reads = sum([x<len(ref_seq[true_seq_id])-DIST_MAX for x in reads for reads in dict_score_alignments])
     count_good_scores = sum([x>=len(ref_seq[true_seq_id])-DIST_MAX for x in reads for reads in dict_score_alignments])
     nb_good_scores_reads = sum([x<len(ref_seq[true_seq_id])-DIST_MAX for x in reads for reads in dict_score_alignments])
 
-    count_bad_scores_once_twice = 0
-    nb_bad_scores_reads_once_twice = 0
-    count_good_scores_once_twice = 0
-    nb_good_scores_reads_once_twice = 0
-
-    for pID in dict_score_alignments.keys():
-        if (dict_score_alignments[pID] <  dict_len_ref_seq[(RUN,barcode)]-DIST_MAX):
-            count_bad_scores += 1
-            nb_bad_scores_reads += dict_pIDs_occ[pID]
-        else:
-            count_good_scores += 1
-            nb_good_scores_reads += dict_pIDs_occ[pID]
-
-    for pID in dict_score_alignments_once_twice.keys():
-        if (max(dict_score_alignments_once_twice[pID]) <  dict_len_ref_seq[(RUN,barcode)]-DIST_MAX):
-            count_bad_scores_once_twice += 1
-            nb_bad_scores_reads_once_twice += dict_pIDs_occ_once_twice[pID]
-        else:
-            count_good_scores_once_twice += 1
-            nb_good_scores_reads_once_twice += dict_pIDs_occ_once_twice[pID]
-
-    print 'PIDS OCC >= 3:'
-    print '--RUN '+str(RUN)+': '+barcode+': #bad align scores: '+str(count_bad_scores)
-    print '--RUN '+str(RUN)+': '+barcode+': #bad align scores (reads): '+str(nb_bad_scores_reads)
-    print '--RUN '+str(RUN)+': '+barcode+': #good align scores: '+str(count_good_scores)
-    print '--RUN '+str(RUN)+': '+barcode+': #good align scores (reads): '+str(nb_good_scores_reads)
-
-    print '1 <= PIDS OCC <= 2:'
-    print '--RUN '+str(RUN)+': '+barcode+': #bad align scores: '+str(count_bad_scores_once_twice)
-    print '--RUN '+str(RUN)+': '+barcode+': #bad align scores (reads): '+str(nb_bad_scores_reads_once_twice)
-    print '--RUN '+str(RUN)+': '+barcode+': #good align scores: '+str(count_good_scores_once_twice)
-    print '--RUN '+str(RUN)+': '+barcode+': #good align scores (reads): '+str(nb_good_scores_reads_once_twice)
+    # count_bad_scores_once_twice = 0
+    # nb_bad_scores_reads_once_twice = 0
+    # count_good_scores_once_twice = 0
+    # nb_good_scores_reads_once_twice = 0
+
+    # for pID in dict_score_alignments.keys():
+    #     if (dict_score_alignments[pID] <  dict_len_ref_seq[(RUN,barcode)]-DIST_MAX):
+    #         count_bad_scores += 1
+    #         nb_bad_scores_reads += dict_pIDs_occ[pID]
+    #     else:
+    #         count_good_scores += 1
+    #         nb_good_scores_reads += dict_pIDs_occ[pID]
+
+    # for pID in dict_score_alignments_once_twice.keys():
+    #     if (max(dict_score_alignments_once_twice[pID]) <  dict_len_ref_seq[(RUN,barcode)]-DIST_MAX):
+    #         count_bad_scores_once_twice += 1
+    #         nb_bad_scores_reads_once_twice += dict_pIDs_occ_once_twice[pID]
+    #     else:
+    #         count_good_scores_once_twice += 1
+    #         nb_good_scores_reads_once_twice += dict_pIDs_occ_once_twice[pID
+                                                                        ]
+
+    #print 'PIDS OCC >= 3:'
     print '****'
+    print '--RUN-barcode: '+str(true_seq_id)+': #bad align scores: '+str(count_bad_scores)
+    print '--RUN-barcode: '+str(true_seq_id)+': #bad align scores (reads): '+str(nb_bad_scores_reads)
+    print '--RUN-barcode: '+str(true_seq_id)+': #good align scores: '+str(count_good_scores)
+    print '--RUN-barcode: '+str(true_seq_id)+': #good align scores (reads): '+str(nb_good_scores_reads)
+
+    # print '1 <= PIDS OCC <= 2:'
+    # print '--RUN '+str(RUN)+': '+barcode+': #bad align scores: '+str(count_bad_scores_once_twice)
+    # print '--RUN '+str(RUN)+': '+barcode+': #bad align scores (reads): '+str(nb_bad_scores_reads_once_twice)
+    # print '--RUN '+str(RUN)+': '+barcode+': #good align scores: '+str(count_good_scores_once_twice)
+    # print '--RUN '+str(RUN)+': '+barcode+': #good align scores (reads): '+str(nb_good_scores_reads_once_twice)
+    
 
 
     # 6.check contamination
@@ -123,78 +127,81 @@ if (len(sys.argv)==5):
     dict_reclassif={}
     dict_count_reclassif_pIDs=defaultdict(int)
     dict_count_reclassif_reads=defaultdict(int)
-    #for bc in barcodes[prefix_date_and_id[RUN-1]]:
-    print 'RUN '+str(RUN)+' - barcode '+barcode+':'
-    #dict_possible_good_align={}
-    #dict_reclassif={}
-    #dict_count_reclassif_pIDs=defaultdict(int)
-    #dict_count_reclassif_reads=defaultdict(int)
-    index_bc = barcodes[prefix_date_and_id[RUN-1]].index(bc)#index of the barcode bc in the barcodes list
-    #barcodes_to_test = barcodes[prefix_date_and_id[RUN-1]][:index_bc]+barcodes[prefix_date_and_id[RUN-1]][index_bc+1:]#list of barcodes - {bc}
-    barcodes_to_test = [i for i in dict_ref_seq.keys() if (i!=(RUN,barcode))]
-
-    # loop for pIDs with occ >= 3
-    print 'loop for pIDs with occ >= 3'
-    for pID in dict_score_alignments.keys():
-        if (dict_score_alignments[pID] < dict_len_ref_seq[(RUN,barcode)]-DIST_MAX):
-            # if the alignment score of cons. seq. for pID with the reference sequence for bc is lower than the threshold
-            print '---found 1 bad alignment: '+pID+', '+str(dict_score_alignments[pID])+' < '+str(dict_len_ref_seq[(RUN,barcode)]-DIST_MAX)
-            dict_possible_good_align[pID]=[]
-            dict_reclassif[pID]=[]
+    print 'RUN - barcode: '+str(true_seq_id)+':'
+    #index_bc = barcodes[prefix_date_and_id[RUN-1]].index(bc)#index of the barcode bc in the barcodes list
+    barcodes_to_test = [ref_seq_bc for ref_seq_bc in dict_ref_seq.keys() if (ref_seq_bc != true_seq_id)]
 
-            for (run2,bc2) in barcodes_to_test: # compare the alignment scores of cons. seq. pID with the ref. seq. for the other barcodes
-                score_align_with_bc2 = lt.align_SW_dist(dict_ref_seq[(run2,bc2)], dict_cons_seq[pID])[2]
-                dict_possible_good_align[pID].append((run2,bc2,score_align_with_bc2))
-                if(score_align_with_bc2 >= dict_len_ref_seq[(run2,bc2)]-DIST_MAX):
-                    print '------possible reclassif of pID '+pID+' (#occ: '+str(dict_pIDs_occ[pID])+') in run '+str(run2) +', bc2 '+bc2+', with align score: '+str(score_align_with_bc2)+' >= '+str(dict_len_ref_seq[(run2,bc2)]-DIST_MAX)
-                    dict_count_reclassif_pIDs[(run2,bc2)]+=1
-                    dict_count_reclassif_reads[(run2,bc2)]+=dict_pIDs_occ[pID]
-                    dict_reclassif[pID].append((run2,bc2,score_align_with_bc2))
-
-            #for bc3 in barcodes[prefix_date_and_id[RUN%2]]: # compare the alignment scores of cons. seq. pID with the ref. seq. for the barcodes of the other run
-            #    score_align_with_bc3 = align_SW_dist(dict_scsp[((RUN%2)+1,bc3)], dict_cons_seq[(RUN,bc)][pID])[2]
-            #    dict_possible_good_align[(RUN,bc)][pID].append(((RUN%2)+1,bc3,score_align_with_bc3))
-            #    if(score_align_with_bc3 >= dict_lim_min_score_align[((RUN%2)+1, bc3)]):
-            #        print '------possible reclassif of pID '+pID+' (#occ: '+str(dict_pIDs_occ[(RUN,bc)][pID])+') in run '+str((RUN%2)+1)+', bc3 '+bc3+', with align score: '+str(score_align_with_bc3)+' >= '+str(dict_lim_min_score_align[((RUN%2)+1, bc3)])
-            #        dict_count_reclassif_pIDs[(RUN,bc)][((RUN%2)+1,bc3)]+=1
-            #        dict_count_reclassif_reads[(RUN,bc)][((RUN%2)+1,bc3)]+=dict_pIDs_occ[(RUN,bc)][pID]
-            #        dict_reclassif[(RUN,bc)][pID].append(((RUN%2)+1,bc3,score_align_with_bc3))
-
-
-    # loop for pIDs with 1 <= occ <= 2
-    print 'loop for pIDs with 1 <= occ <=2'
-    for pID in dict_score_alignments_once_twice.keys():
-        if (max(dict_score_alignments_once_twice[pID]) < dict_len_ref_seq[(RUN,barcode)]-DIST_MAX):
-            # if the alignment score (occ=1) or the best one (occ=2) for pID with the reference sequence for bc is lower than the threshold
-            print '---found 1 bad alignment: '+pID+', '+str(max(dict_score_alignments_once_twice[pID]))+' < '+str(dict_len_ref_seq[(RUN,barcode)]-DIST_MAX)
-            dict_possible_good_align[pID]=[]
-            dict_reclassif[pID]=[]
-            #nb_occ_cur_pID = len(dict_filtered_reads_pIDs_once_twice[(RUN,bc)][pID])      
-            nb_occ_cur_pID = sum([dict_pIDs_occ_once_twice[pID][i] for i in [0,1]])
+    
 
-            for bc2 in barcodes_to_test: # compare the alignment scores of the sequence(s) of pID with the ref. seq. for the other barcodes
-                score_align_with_bc2 = max([lt.align_SW_dist(dict_ref_seq[(run2,bc2)], dict_filtered_reads_pIDs_once_twice[pID][i])[2] for i in range(nb_occ_cur_pID)])
-                dict_possible_good_align[pID].append((run2,bc2,score_align_with_bc2))
-                if(score_align_with_bc2 >= dict_len_ref_seq[(run2,bc2)]-DIST_MAX):
-                    print '------possible reclassif of pID '+pID+' (#occ: '+str(nb_occ_cur_pID)+') in run '+str(run2) +', bc2 '+bc2+', with align score: '+str(score_align_with_bc2)+' >= '+str(dict_len_ref_seq[(run2,bc2)]-DIST_MAX)
-                    dict_count_reclassif_pIDs[(run2,bc2)]+=1
-                    dict_count_reclassif_reads[(run2,bc2)]+=nb_occ_cur_pID
-                    dict_reclassif[pID].append((run2,bc2,score_align_with_bc2))
 
+    for pID,align_scores in dict_score_alignments.iteritems():
+        if(max(align_scores) < len(dict_ref_seq[true_seq_id]) - DIST_MAX):
+            # if the alignment score of cons. seq. for pID with the reference sequence for bc is lower than the threshold
+            print '----found 1 bad alignment: '+pID+', '+str(dict_score_alignments[pID])+' < '+str(len(dict_ref_seq[true_seq_id])-DIST_MAX)
+            dict_possible_good_align[pID]=[]
+            dict_reclassif[pID]=[]
+            is_a_thrice_or_more = pID in dict_cons_seq.keys()
+            is_a_twice_with_distinct_reads = len(align_scores)==2
+            for ref_seq_id_2 in barcodes_to_test:# compare the alignment scores of cons. seq. with the ref. seq. for the other barcodes
+                if is_a_thrice_or_more:
+                    score_align_with_ref_seq_id_2 = lt.align_SW_dist(dict_ref_seq[ref_seq_id_2], dict_cons_seq[pID][2])[2]
+                elif is_a_twice_with_distinct_reads:
+                    score_align_with_ref_seq_id_2 = max([lt.align_SW_dist(dict_ref_seq[ref_seq_id_2], dict_all_reads[pID][i][2])[2] for i in [0,1]])
+                else: # is a once
+                    score_align_with_ref_seq_id_2 = lt.align_SW_dist(dict_ref_seq[ref_seq_id_2], dict_all_reads[pID][0][2])[2]
+
+                dict_possible_good_align[pID].append((ref_seq_id_2,score_align_with_ref_seq_id_2))
+                if(score_align_with_ref_seq_id_2 >= len(dict_ref_seq[ref_seq_id_2])-DIST_MAX):
+                    print '------possible reclassif of pID '+pID+' in ref_seq_id_2 '+str(ref_seq_id_2)+', with align score: '+str(score_align_with_ref_seq_id_2)+' >= '+str(len(dict_ref_seq[ref_seq_id_2])-DIST_MAX)
+                    dict_count_reclassif_pIDs[ref_seq_id_2]+=1
+                    dict_count_reclassif_reads[ref_seq_id_2]+=sum([sum(map(int,np.array(dict_all_reads[pID])[:,i])) for i in [0,1]]) #dict_pIDs_occ[pID]
+                    dict_reclassif[pID].append((ref_seq_id_2,score_align_with_ref_seq_id_2))
+
+    # # loop for pIDs with occ >= 3
+    # # print 'loop for pIDs with occ >= 3'
+    # for pID in dict_score_alignments.keys():
+    #     if (dict_score_alignments[pID] < len(dict_ref_seq[true_seq_id])-DIST_MAX):
+    #         # if the alignment score of cons. seq. for pID with the reference sequence for bc is lower than the threshold
+    #         print '---found 1 bad alignment: '+pID+', '+str(dict_score_alignments[pID])+' < '+str(dict_len_ref_seq[(RUN,barcode)]-DIST_MAX)
+    #         dict_possible_good_align[pID]=[]
+    #         dict_reclassif[pID]=[]
+
+    #         for (run2,bc2) in barcodes_to_test: # compare the alignment scores of cons. seq. pID with the ref. seq. for the other barcodes
+    #             score_align_with_bc2 = lt.align_SW_dist(dict_ref_seq[(run2,bc2)], dict_cons_seq[pID])[2]
+    #             dict_possible_good_align[pID].append((run2,bc2,score_align_with_bc2))
+    #             if(score_align_with_bc2 >= dict_len_ref_seq[(run2,bc2)]-DIST_MAX):
+    #                 print '------possible reclassif of pID '+pID+' (#occ: '+str(dict_pIDs_occ[pID])+') in run '+str(run2) +', bc2 '+bc2+', with align score: '+str(score_align_with_bc2)+' >= '+str(dict_len_ref_seq[(run2,bc2)]-DIST_MAX)
+    #                 dict_count_reclassif_pIDs[(run2,bc2)]+=1
+    #                 dict_count_reclassif_reads[(run2,bc2)]+=dict_pIDs_occ[pID]
+    #                 dict_reclassif[pID].append((run2,bc2,score_align_with_bc2))
 
-            #for bc3 in barcodes[prefix_date_and_id[RUN%2]]: # compare the alignment scores of the sequence(s) of pID with the ref. seq. for the barcodes of the other run
-            #    score_align_with_bc3 = max([align_SW_dist(dict_scsp[((RUN%2)+1,bc3)], dict_filtered_reads_pIDs_once_twice[(RUN,bc)][pID][i])[2] for i in range(nb_occ_cur_pID)])
-            #    dict_possible_good_align[(RUN,bc)][pID].append(((RUN%2)+1,bc3,score_align_with_bc3))
-            #    if(score_align_with_bc3 >= dict_lim_min_score_align[((RUN%2)+1, bc3)]):
-            #        print '------possible reclassif of pID '+pID+' (#occ: '+str(nb_occ_cur_pID)+') in run '+str((RUN%2)+1)+', bc3 '+bc3+', with align score: '+str(score_align_with_bc3)+' >= '+str(dict_lim_min_score_align[((RUN%2)+1, bc3)])
-            #        dict_count_reclassif_pIDs[(RUN,bc)][((RUN%2)+1,bc3)]+=1
-            #        dict_count_reclassif_reads[(RUN,bc)][((RUN%2)+1,bc3)]+=dict_pIDs_occ_once_twice[(RUN,bc)][pID]
-            #        dict_reclassif[(RUN,bc)][pID].append(((RUN%2)+1,bc3,score_align_with_bc3))
+    
+    # # loop for pIDs with 1 <= occ <= 2
+    # print 'loop for pIDs with 1 <= occ <=2'
+    # for pID in dict_score_alignments_once_twice.keys():
+    #     if (max(dict_score_alignments_once_twice[pID]) < dict_len_ref_seq[(RUN,barcode)]-DIST_MAX):
+    #         # if the alignment score (occ=1) or the best one (occ=2) for pID with the reference sequence for bc is lower than the threshold
+    #         print '---found 1 bad alignment: '+pID+', '+str(max(dict_score_alignments_once_twice[pID]))+' < '+str(dict_len_ref_seq[(RUN,barcode)]-DIST_MAX)
+    #         dict_possible_good_align[pID]=[]
+    #         dict_reclassif[pID]=[]
+    #         #nb_occ_cur_pID = len(dict_filtered_reads_pIDs_once_twice[(RUN,bc)][pID])      
+    #         nb_occ_cur_pID = sum([dict_pIDs_occ_once_twice[pID][i] for i in [0,1]])
+
+    #         for bc2 in barcodes_to_test: # compare the alignment scores of the sequence(s) of pID with the ref. seq. for the other barcodes
+    #             score_align_with_bc2 = max([lt.align_SW_dist(dict_ref_seq[(run2,bc2)], dict_filtered_reads_pIDs_once_twice[pID][i])[2] for i in range(nb_occ_cur_pID)])
+    #             dict_possible_good_align[pID].append((run2,bc2,score_align_with_bc2))
+    #             if(score_align_with_bc2 >= dict_len_ref_seq[(run2,bc2)]-DIST_MAX):
+    #                 print '------possible reclassif of pID '+pID+' (#occ: '+str(nb_occ_cur_pID)+') in run '+str(run2) +', bc2 '+bc2+', with align score: '+str(score_align_with_bc2)+' >= '+str(dict_len_ref_seq[(run2,bc2)]-DIST_MAX)
+    #                 dict_count_reclassif_pIDs[(run2,bc2)]+=1
+    #                 dict_count_reclassif_reads[(run2,bc2)]+=nb_occ_cur_pID
+    #                 dict_reclassif[pID].append((run2,bc2,score_align_with_bc2))
 
 
 
     #7.print contamination statistics
-
+####
+# CONTINUE HERE
+####
     # for bc in barcodes[prefix_date_and_id[RUN-1]]:
     count_pIDs_reclassif = 0
     count_pIDs_no_good_alignments = 0
@@ -205,27 +212,18 @@ if (len(sys.argv)==5):
         else:
             count_pIDs_no_good_alignments +=1
             if pID in dict_pIDs_occ.keys():
-                count_reads_no_good_alignments += dict_pIDs_occ[pID]
+                count_reads_no_good_alignments += dict_pIDs_occ[pID][0] + dict_pIDs_occ[pID][1]
             else:
-                count_reads_no_good_alignments += sum([dict_pIDs_occ_once_twice[pID][i] for i in [0,1]]) #dict_pIDs_occ_once_twice[(RUN,bc)][pID]
+                count_reads_no_good_alignments += dict_pIDs_occ_once_twice[pID]
 
     print 'RUN '+str(RUN)+' - barcode '+barcode+': #pIDs with better alignments from other barcodes: '+str(count_pIDs_reclassif)
     print '-----> #pIDs with no good alignments: '+ str(count_pIDs_no_good_alignments)
     print '-----> #reads with no good alignments: '+ str(count_reads_no_good_alignments)
     print '##########'
-    #for bc in barcodes[prefix_date_and_id[RUN-1]]:
-    #print 'RUN '+str(RUN)+' - barcode '+barcode+':'
-    #print '--> #pIDs from:  '+str(dict_count_reclassif_pIDs[(RUN,bc)])
-    #print '--> #reads from: '+str(dict_count_reclassif_reads[(RUN,bc)])
-
 
 
     #8.decontaminate the filtered reads (neighbors-indels) file
-    #dict_count_pIDs_to_bad_aligned_file={}
-    #dict_count_pIDs_from_contamination={}
-    #dict_count_pIDs_to_decontaminated_file={}
 
-    # for bc in barcodes[prefix_date_and_id[RUN-1]]:
     dict_count_pIDs_to_bad_aligned_file=defaultdict(int)
     dict_count_pIDs_from_contamination=defaultdict(int)
     dict_count_pIDs_to_decontaminated_file=defaultdict(int)
index 511caad..4bbb1a1 100644 (file)
@@ -32,11 +32,11 @@ DIST_MAX = 3
 # minimum #times a pID must occur to have neighbors 
 MIN_OCC_MOST_ABUND_PID = 10
 
-min_pid_alignment_score = 8.4
-# anyting >9 does not allow any substitions
-# anythin >8.4 allows only one point mutation
-# >8 allows one indel and one substitution
-
+max_pid_alignment_dist = 1.6
+# anything <1 does not allow any substitions
+# 1 allows only one point mutation
+# 1.6 allows one indel and one substitution
+# alignment parameters: match=1, unmatch=0, opening gap penalty=-0.6, gap extension penalty=-0.3
 
 
 ########
@@ -49,7 +49,7 @@ if (len(sys.argv) == 3):
     barcode_dir = str(sys.argv[1]).rstrip('/')+'/'
     readtype = sys.argv[2]
       
-    # 1. Create the dictionary of pIDs (from the output files of p1_trim_and_filter)
+    # Create the dictionary of pIDs (from the output files of p1_trim_and_filter)
 
     ###################################
     # -- READ THE FILTERED READS FILE #
@@ -116,7 +116,7 @@ if (len(sys.argv) == 3):
              if((dict_neighbor_state_pIDs[pID_r[1]] == False) and (pID_r[0] < pID_a[0])):
                 pIDs_align_score = align.localms(pID_a[1], pID_r[1], 1, 0, -0.6, -0.3)
                 if (len(pIDs_align_score) != 0): # if pID_a[1] and pID_r[1] can be aligned
-                    if (pIDs_align_score[0][2] >= min_pid_alignment_score):
+                    if (pIDs_align_score[0][2] >= len(pID_a[1]) - max_pid_alignment_dist):
                         print 'pID_a: '+str(pID_a)+', pID_r: '+str(pID_r)
                         print pIDs_align_score[0][0], pIDs_align_score[0][1]
                         print dict_neighbor_state_pIDs[pID_a[1]], dict_neighbor_state_pIDs[pID_r[1]]
@@ -128,13 +128,18 @@ if (len(sys.argv) == 3):
                             neighbor_candidates = dict_all_reads[pID_r[1]]
 
 
-                        if (pID_r[0] != 2): # pID_r occurs once or more than twice: only one sequence comparison to do
-                            seq_a = dict_cons_seq[pID_a[1]][0][2] 
-                            if(pID_r[0] == 1): # pID_r occurs once: its sequence is in dict_all_reads
+                        seq_a = dict_cons_seq[pID_a[1]][0][2] 
+                        #if (pID_r[0] != 2 or (pID_r[0] == 2 and len(dict_all_reads[pID_r[1]])==1)): # pID_r occurs once or more than twice, or twice with identical reads: only one sequence comparison to do
+                            #if(pID_r[0] <= 2): # pID_r occurs once or twice (with identical reads): its sequence is in dict_all_reads
                                 # seq_r = dict_all_reads[pID_r[0]][0][2]
-                                seq_r = dict_all_reads[pID_r[1]][0][2] # [nb_fwd,nb_rev,seq]
-                            else: # pID_r occurs at least 3 times: its consensus sequence is in dict_cons_seq
-                                seq_r = dict_cons_seq[pID_r[1]][0][2] # [nb_fwd,nb_rev,seq]
+                                #seq_r = dict_all_reads[pID_r[1]][0][2] # [nb_fwd,nb_rev,seq]
+                            #else: # pID_r occurs at least 3 times: its consensus sequence is in dict_cons_seq
+                                #seq_r = dict_cons_seq[pID_r[1]][0][2] # [nb_fwd,nb_rev,seq]
+                        #else: # pID_r occurs twice with distinct reads: if one read aligns well: pID_r is considered as a mutant
+                            #[seq_r_0,seq_r_1] = [dict_all_reads[pID_r[1]][i][2] for i in [0,1]]
+                            
+                        #if(pID_)
+
 
                         for nf, nr, seq_r in neighbor_candidates:
                             if lt.check_neighbor_plausibility(seq_a, seq_r, DIST_MAX):
@@ -147,8 +152,8 @@ if (len(sys.argv) == 3):
 
     print 'neighbors:'
     nb_neighbors_stated = 0
-    for pID in dict_neighbor_state_pIDs.keys():
-        if dict_neighbor_state_pIDs[pID]==True:
+    for pID,state in dict_neighbor_state_pIDs.iteritems():
+        if state:
             nb_neighbors_stated += 1
             print pID
 
@@ -160,7 +165,6 @@ if (len(sys.argv) == 3):
     count_reads_written = 0
     count_neighbors_reads_written = 0
 
-    #with open(filtered_reads_file_name[:-3]+'_neighbors_indels.fa', 'w') as neighbors_filtered_reads_file:
     # write the new filtered reads file (mutant-indels):
     corrected_aligned_reads_fname = barcode_dir+'corrected_reads.fasta'
     with open(corrected_aligned_reads_fname, 'w') as neighbors_filtered_reads_file:
@@ -170,9 +174,9 @@ if (len(sys.argv) == 3):
 
                 # write the reads corresponding to pID
                 for cur_read in dict_all_reads[pID]: # cur_read is a list [nb_occ_fwd,nb_occ_rev,seq]
-                    pid_orig = pID
-                    mut_flag = ''
-                    new_read_id = str(pID+'_'+str(cur_read[0])+'_'+str(cur_read[1])+'_'+pid_orig+mut_flag)
+                    #pid_orig = pID
+                    #mut_flag = ''
+                    new_read_id = str(pID+'_'+str(cur_read[0])+'_'+str(cur_read[1])+'_'+pID)
                     neighbors_filtered_reads_file.write('>'+new_read_id+'\n') # write the new id of the read
                     neighbors_filtered_reads_file.write(cur_read[2]+'\n') # write the sequence
                     count_reads_written += (cur_read[0]+cur_read[1])