debug p2_sort, p7_decontamination, p8_detect_mutants_indels; added a function remove_gaps to lib_tools
author    ebenard <ebenard@ag-neher-benard.(none)>
          Tue, 24 Sep 2013 12:32:19 +0000 (14:32 +0200)
committer ebenard <ebenard@ag-neher-benard.(none)>
          Tue, 24 Sep 2013 12:32:19 +0000 (14:32 +0200)
src/lib_tools.py
src/p2_sort.py
src/p4_consensus.py
src/p7_decontamination.py [changed mode: 0644->0755]
src/p8_detect_mutants_indels.py [changed mode: 0644->0755]

diff --git a/src/lib_tools.py b/src/lib_tools.py
index a07f1c8..856e665 100644 (file)
@@ -115,3 +115,6 @@ def check_neighbor_plausibility(seq1, seq2, distance_cutoff, verbose = False):
         print score[0]
         print score[1]
     return (score[2] >= len(seq1) - DIST_MAX)
+
+def remove_gaps(seq):
+    return seq.replace('-','')
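The new lib_tools helper simply strips alignment gap characters from a sequence string. A minimal usage sketch, assuming lib_tools is importable as lt the way the pipeline scripts do it (the example sequence is made up):

    import lib_tools as lt

    aligned_seq = 'AC--GT-ACGT'                    # hypothetical aligned read with gap characters
    assert lt.remove_gaps(aligned_seq) == 'ACGTACGT'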
diff --git a/src/p2_sort.py b/src/p2_sort.py
index 6092138..aed6b82 100755 (executable)
@@ -60,24 +60,23 @@ if (len(sys.argv) ==3):
                 pID = str(record.id.split('_')[0])
                 dict_pIDs[pID].append((record.id,record.seq))
                 
-                all_pIDs = sorted(dict_pIDs.keys())
+        all_pIDs = sorted(dict_pIDs.keys())
 
         
-        if(len(all_pIDs)>0): #avoids creation of temp dirs from previous barcode if the current reads file is empty...
-            batch=0
-            for pii, pID in enumerate(all_pIDs):
-                if((pii%batchsize)==0):
-                    batch=pii/batchsize
-                    temp_pid_files_dir =bc_dir+'/temp_'+readtype+'_'+"{0:04d}".format(batch)
-                    print 'pii: '+str(pii)+', '+temp_pid_files_dir
-                    lt.check_and_create_directory(temp_pid_files_dir)
-                with open(temp_pid_files_dir+'/'+ pID+'.fasta', 'w') as output_pID_file:
-                    for read in dict_pIDs[pID]:
-                        output_pID_file.write(str('>'+read[0]+'\n'))
-                        output_pID_file.write(str(read[1]+'\n'))
-                        count+=1
-                        if(count%500==0):
-                            print 'count = ' + str(count)
+        batch=0
+        for pii, pID in enumerate(all_pIDs):
+            if((pii%batchsize)==0):
+                batch=pii/batchsize
+                temp_pid_files_dir =bc_dir+'/temp_'+readtype+'_'+"{0:04d}".format(batch)
+                print 'pii: '+str(pii)+', '+temp_pid_files_dir
+                lt.check_and_create_directory(temp_pid_files_dir)
+            with open(temp_pid_files_dir+'/'+ pID+'.fasta', 'w') as output_pID_file:
+                for read in dict_pIDs[pID]:
+                    output_pID_file.write(str('>'+read[0]+'\n'))
+                    output_pID_file.write(str(read[1]+'\n'))
+                    count+=1
+                    if(count%500==0):
+                        print 'count = ' + str(count)
                                     
         print 'total : ' + str(count)
         #else:
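The rewritten block above sorts the pIDs once per barcode and writes them out in batches: every batchsize pIDs a new temp_<readtype>_<batch> directory is created, and each pID gets its own <pID>.fasta file inside the current batch directory. A minimal sketch of that grouping rule (batchsize, read type and pIDs are illustrative):

    batchsize = 100                                    # illustrative value
    all_pIDs = sorted(['AAAC', 'AAAA', 'AAAG'])        # hypothetical sorted pIDs
    for pii, pID in enumerate(all_pIDs):
        if pii % batchsize == 0:
            batch = pii // batchsize                   # 0 for pIDs 0..99, 1 for 100..199, ...
            temp_pid_files_dir = 'bc_dir/temp_fwd_' + "{0:04d}".format(batch)
            # -> 'bc_dir/temp_fwd_0000', then 'bc_dir/temp_fwd_0001', ...
        # each pID is then written to temp_pid_files_dir + '/' + pID + '.fasta'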
diff --git a/src/p4_consensus.py b/src/p4_consensus.py
index 948ae53..04b88a6 100755 (executable)
@@ -44,7 +44,8 @@ def make_consensus(file_name):
         str_consensus = "".join(alpha[np.argmax(nuc_counts, axis=0)])
         
         #remove gaps
-        str_consensus_2 = "".join([nuc for nuc in str_consensus if nuc!='-'])
+        #str_consensus_2 = "".join([nuc for nuc in str_consensus if nuc!='-'])
+        str_consensus_2 = lt.remove_gaps(str_consensus)
 
         return (str_consensus_2, np.sum(nb_reads_fwd), np.sum(nb_reads_rev))    
     else:
@@ -83,7 +84,7 @@ if __name__=='__main__':
                         if consensus_seq[1]+consensus_seq[2]>2:
                             consensus_file.write('>'+lt.read_label(pID, consensus_seq[1], consensus_seq[2])+'\n')
                             consensus_file.write(consensus_seq[0]+'\n')
-
+                                
                     shutil.rmtree(temp_dir)
     else:
         print auto_file_name+': usage: '+auto_file_name+' <run directory> <read type to work on>'
diff --git a/src/p7_decontamination.py b/src/p7_decontamination.py
old mode 100644 (file)
new mode 100755 (executable)
index 541a887..c9b1500
@@ -4,13 +4,7 @@
 #/!\#
 
 # script that decontaminates a filtered reads file for a given barcode and run id
-# input: - filtered reads file to decontaminate (in ../data/)
-#        - consensus sequences file to check contamination (in ../templates/<date-and-id>_consensus/)
-#        - reference sequences file to check contamination (in ../data/)
-#        - run id (int) corresponding to the associated filtered reads and consensus sequences files
-#
-# output:- decontaminated filtered reads file (in ../data/)
-#        - filtered reads file containing reads that align badly with all the reference sequences (in ../data/)
+
  
 import numpy as np
 from Bio import AlignIO
@@ -46,19 +40,23 @@ if (len(sys.argv)==5):
 
 
     #2.parse the consensus sequences files for the given prefix_date_and_id in dict_cons_seq
-    dict_cons_seq, a,b = lt.parse_readfile(barcode_dir+'consensus_sequence_'+readtype+'.fasta')
-
+    dict_cons_seq, a,b = lt.parse_readfile(barcode_dir+'consensus_sequences_'+readtype+'.fasta')
+    print dict_cons_seq
+    
+    #/!\ : aligned reads: need to trim the gaps at one point
     dict_all_reads, nreads, nbadreads = lt.parse_readfile(barcode_dir+'aligned_reads_'+readtype+'.fasta')
 
     dict_score_alignments = defaultdict(list)
     for pID in dict_all_reads:
         if pID in dict_cons_seq:
-            score_align = lt.align_SW_dist(dict_ref_seq[true_seq_id], dict_cons_seq[pID][2])[2]
+            print 'FLAG:'
+            print dict_cons_seq[pID]
+            score_align = lt.align_SW_dist(dict_ref_seq[true_seq_id], dict_cons_seq[pID][0][2])[2]
             print '----'+str(true_seq_id)+': '+pID+', align score: '+str(score_align)
             dict_score_alignments[pID].append(score_align)
         else:
             for nf, nb, seq in dict_all_reads[pID]:
-                score_align = lt.align_SW_dist(dict_ref_seq[true_seq_id], dict_cons_seq[pID][2])[2]
+                score_align = lt.align_SW_dist(dict_ref_seq[true_seq_id], seq)[2]
                 print '----'+str(true_seq_id)+': '+pID+', align score: '+str(score_align)
                 dict_score_alignments[pID].append(score_align)
 
@@ -78,48 +76,19 @@ if (len(sys.argv)==5):
     #5.count good and bad alignments
     
 
-    count_bad_scores = sum([x<len(ref_seq[true_seq_id])-DIST_MAX for x in reads for reads in dict_score_alignments])
-    nb_bad_scores_reads = sum([x<len(ref_seq[true_seq_id])-DIST_MAX for x in reads for reads in dict_score_alignments])
-    count_good_scores = sum([x>=len(ref_seq[true_seq_id])-DIST_MAX for x in reads for reads in dict_score_alignments])
-    nb_good_scores_reads = sum([x<len(ref_seq[true_seq_id])-DIST_MAX for x in reads for reads in dict_score_alignments])
-
-    # count_bad_scores_once_twice = 0
-    # nb_bad_scores_reads_once_twice = 0
-    # count_good_scores_once_twice = 0
-    # nb_good_scores_reads_once_twice = 0
-
-    # for pID in dict_score_alignments.keys():
-    #     if (dict_score_alignments[pID] <  dict_len_ref_seq[(RUN,barcode)]-DIST_MAX):
-    #         count_bad_scores += 1
-    #         nb_bad_scores_reads += dict_pIDs_occ[pID]
-    #     else:
-    #         count_good_scores += 1
-    #         nb_good_scores_reads += dict_pIDs_occ[pID]
-
-    # for pID in dict_score_alignments_once_twice.keys():
-    #     if (max(dict_score_alignments_once_twice[pID]) <  dict_len_ref_seq[(RUN,barcode)]-DIST_MAX):
-    #         count_bad_scores_once_twice += 1
-    #         nb_bad_scores_reads_once_twice += dict_pIDs_occ_once_twice[pID]
-    #     else:
-    #         count_good_scores_once_twice += 1
-    #         nb_good_scores_reads_once_twice += dict_pIDs_occ_once_twice[pID
-                                                                        ]
-
-    #print 'PIDS OCC >= 3:'
+    count_bad_scores = sum([sum([x<len(dict_ref_seq[true_seq_id])-DIST_MAX for x in reads]) for reads in dict_score_alignments])
+    nb_bad_scores_reads = sum([sum([x<len(dict_ref_seq[true_seq_id])-DIST_MAX for x in reads]) for reads in dict_score_alignments])
+    count_good_scores = sum([sum([x>=len(dict_ref_seq[true_seq_id])-DIST_MAX for x in reads]) for reads in dict_score_alignments])
+    nb_good_scores_reads = sum([sum([x<len(dict_ref_seq[true_seq_id])-DIST_MAX for x in reads]) for reads in dict_score_alignments])
+
+                                                                         
     print '****'
     print '--RUN-barcode: '+str(true_seq_id)+': #bad align scores: '+str(count_bad_scores)
     print '--RUN-barcode: '+str(true_seq_id)+': #bad align scores (reads): '+str(nb_bad_scores_reads)
     print '--RUN-barcode: '+str(true_seq_id)+': #good align scores: '+str(count_good_scores)
     print '--RUN-barcode: '+str(true_seq_id)+': #good align scores (reads): '+str(nb_good_scores_reads)
 
-    # print '1 <= PIDS OCC <= 2:'
-    # print '--RUN '+str(RUN)+': '+barcode+': #bad align scores: '+str(count_bad_scores_once_twice)
-    # print '--RUN '+str(RUN)+': '+barcode+': #bad align scores (reads): '+str(nb_bad_scores_reads_once_twice)
-    # print '--RUN '+str(RUN)+': '+barcode+': #good align scores: '+str(count_good_scores_once_twice)
-    # print '--RUN '+str(RUN)+': '+barcode+': #good align scores (reads): '+str(nb_good_scores_reads_once_twice)
-    
-
-
     # 6.check contamination
     print '********'
     print 'Check contamination'
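For clarity, the tallies printed above compare every alignment score with the length threshold len(dict_ref_seq[true_seq_id]) - DIST_MAX. A minimal sketch of the intended per-score counting, assuming dict_score_alignments maps each pID to its list of scores (all values below are mock stand-ins for the script's variables):

    DIST_MAX = 3
    ref_seq = 'ACGTACGTACGTACGT'                               # stands in for dict_ref_seq[true_seq_id]
    dict_score_alignments = {'AAAA': [16, 15], 'AAAC': [9]}    # pID -> list of alignment scores
    threshold = len(ref_seq) - DIST_MAX                        # 13
    count_bad_scores  = sum(sum(s <  threshold for s in scores)
                            for scores in dict_score_alignments.values())   # 1
    count_good_scores = sum(sum(s >= threshold for s in scores)
                            for scores in dict_score_alignments.values())   # 2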
@@ -144,7 +113,7 @@ if (len(sys.argv)==5):
             is_a_twice_with_distinct_reads = len(align_scores)==2
             for ref_seq_id_2 in barcodes_to_test:# compare the alignment scores of cons. seq. with the ref. seq. for the other barcodes
                 if is_a_thrice_or_more:
-                    score_align_with_ref_seq_id_2 = lt.align_SW_dist(dict_ref_seq[ref_seq_id_2], dict_cons_seq[pID][2])[2]
+                    score_align_with_ref_seq_id_2 = lt.align_SW_dist(dict_ref_seq[ref_seq_id_2], dict_cons_seq[pID][0][2])[2]
                 elif is_a_twice_with_distinct_reads:
                     score_align_with_ref_seq_id_2 = max([lt.align_SW_dist(dict_ref_seq[ref_seq_id_2], dict_all_reads[pID][i][2])[2] for i in [0,1]])
                 else: # is a once
@@ -157,128 +126,79 @@ if (len(sys.argv)==5):
                     dict_count_reclassif_reads[ref_seq_id_2]+=sum([sum(map(int,np.array(dict_all_reads[pID])[:,i])) for i in [0,1]]) #dict_pIDs_occ[pID]
                     dict_reclassif[pID].append((ref_seq_id_2,score_align_with_ref_seq_id_2))
 
-    # # loop for pIDs with occ >= 3
-    # # print 'loop for pIDs with occ >= 3'
-    # for pID in dict_score_alignments.keys():
-    #     if (dict_score_alignments[pID] < len(dict_ref_seq[true_seq_id])-DIST_MAX):
-    #         # if the alignment score of cons. seq. for pID with the reference sequence for bc is lower than the threshold
-    #         print '---found 1 bad alignment: '+pID+', '+str(dict_score_alignments[pID])+' < '+str(dict_len_ref_seq[(RUN,barcode)]-DIST_MAX)
-    #         dict_possible_good_align[pID]=[]
-    #         dict_reclassif[pID]=[]
-
-    #         for (run2,bc2) in barcodes_to_test: # compare the alignment scores of cons. seq. pID with the ref. seq. for the other barcodes
-    #             score_align_with_bc2 = lt.align_SW_dist(dict_ref_seq[(run2,bc2)], dict_cons_seq[pID])[2]
-    #             dict_possible_good_align[pID].append((run2,bc2,score_align_with_bc2))
-    #             if(score_align_with_bc2 >= dict_len_ref_seq[(run2,bc2)]-DIST_MAX):
-    #                 print '------possible reclassif of pID '+pID+' (#occ: '+str(dict_pIDs_occ[pID])+') in run '+str(run2) +', bc2 '+bc2+', with align score: '+str(score_align_with_bc2)+' >= '+str(dict_len_ref_seq[(run2,bc2)]-DIST_MAX)
-    #                 dict_count_reclassif_pIDs[(run2,bc2)]+=1
-    #                 dict_count_reclassif_reads[(run2,bc2)]+=dict_pIDs_occ[pID]
-    #                 dict_reclassif[pID].append((run2,bc2,score_align_with_bc2))
-
-    
-    # # loop for pIDs with 1 <= occ <= 2
-    # print 'loop for pIDs with 1 <= occ <=2'
-    # for pID in dict_score_alignments_once_twice.keys():
-    #     if (max(dict_score_alignments_once_twice[pID]) < dict_len_ref_seq[(RUN,barcode)]-DIST_MAX):
-    #         # if the alignment score (occ=1) or the best one (occ=2) for pID with the reference sequence for bc is lower than the threshold
-    #         print '---found 1 bad alignment: '+pID+', '+str(max(dict_score_alignments_once_twice[pID]))+' < '+str(dict_len_ref_seq[(RUN,barcode)]-DIST_MAX)
-    #         dict_possible_good_align[pID]=[]
-    #         dict_reclassif[pID]=[]
-    #         #nb_occ_cur_pID = len(dict_filtered_reads_pIDs_once_twice[(RUN,bc)][pID])      
-    #         nb_occ_cur_pID = sum([dict_pIDs_occ_once_twice[pID][i] for i in [0,1]])
-
-    #         for bc2 in barcodes_to_test: # compare the alignment scores of the sequence(s) of pID with the ref. seq. for the other barcodes
-    #             score_align_with_bc2 = max([lt.align_SW_dist(dict_ref_seq[(run2,bc2)], dict_filtered_reads_pIDs_once_twice[pID][i])[2] for i in range(nb_occ_cur_pID)])
-    #             dict_possible_good_align[pID].append((run2,bc2,score_align_with_bc2))
-    #             if(score_align_with_bc2 >= dict_len_ref_seq[(run2,bc2)]-DIST_MAX):
-    #                 print '------possible reclassif of pID '+pID+' (#occ: '+str(nb_occ_cur_pID)+') in run '+str(run2) +', bc2 '+bc2+', with align score: '+str(score_align_with_bc2)+' >= '+str(dict_len_ref_seq[(run2,bc2)]-DIST_MAX)
-    #                 dict_count_reclassif_pIDs[(run2,bc2)]+=1
-    #                 dict_count_reclassif_reads[(run2,bc2)]+=nb_occ_cur_pID
-    #                 dict_reclassif[pID].append((run2,bc2,score_align_with_bc2))
-
-
-
-    #7.print contamination statistics
-####
-# CONTINUE HERE
-####
-    # for bc in barcodes[prefix_date_and_id[RUN-1]]:
-    count_pIDs_reclassif = 0
-    count_pIDs_no_good_alignments = 0
-    count_reads_no_good_alignments = 0
-    for pID in dict_reclassif.keys():
-        if len(dict_reclassif[pID])>0:
-            count_pIDs_reclassif+=1
-        else:
-            count_pIDs_no_good_alignments +=1
-            if pID in dict_pIDs_occ.keys():
-                count_reads_no_good_alignments += dict_pIDs_occ[pID][0] + dict_pIDs_occ[pID][1]
-            else:
-                count_reads_no_good_alignments += dict_pIDs_occ_once_twice[pID]
-
-    print 'RUN '+str(RUN)+' - barcode '+barcode+': #pIDs with better alignments from other barcodes: '+str(count_pIDs_reclassif)
-    print '-----> #pIDs with no good alignments: '+ str(count_pIDs_no_good_alignments)
-    print '-----> #reads with no good alignments: '+ str(count_reads_no_good_alignments)
-    print '##########'
-
-
-    #8.decontaminate the filtered reads (neighbors-indels) file
+  
 
+    # 7.print contamination statistics
+    # 8.decontaminate the filtered reads file
     dict_count_pIDs_to_bad_aligned_file=defaultdict(int)
     dict_count_pIDs_from_contamination=defaultdict(int)
     dict_count_pIDs_to_decontaminated_file=defaultdict(int)
     list_of_reads_to_write_in_bad_aligned_file=[]
     list_of_reads_to_write_in_decontaminated_file=[]
-    with open(relative_filtered_reads_file_path, 'r') as infile:
-    #with open(str('../data/'+prefix_date_and_id[RUN-1]+'_'+bc+'_sorted_and_filtered_reads_SW_min_occ_same_pid_1_neighbors_indels.fa'), 'r') as infile:
-        for record in SeqIO.parse(infile,'fasta'):
-            pID = record.id.split('_')[0]
-            if (pID in dict_reclassif.keys()): # if the seq. for pID aligns badly with the ref. seq. of its barcode 
-                if not (len(dict_reclassif[pID])>0): # and if it does not align well with the ref. seq. of the other barcodes (run 1 and run 2)
-                    # don't know what to do with them: keep them in a file
+    list_of_reads_from_contamination=[]
+    count_reads_no_good_alignments = 0
+    count_reads_good_alignments = 0
+
+    with open(barcode_dir+readtype+'_reads.fasta','r') as filtered_reads_file:
+    #for pID,reads in dict_all_reads.iteritems():
+        for record in SeqIO.parse(filtered_reads_file,'fasta'):
+            pID,nb_reads_fwd,nb_reads_rev,pID_orig = record.id.split('_')[0:4]
+            nb_reads_fwd = int(nb_reads_fwd)
+            nb_reads_rev = int(nb_reads_rev)
+            read = record.seq
+            if pID in dict_reclassif.keys(): # bad alignment with the associated ref. seq.
+                if len(dict_reclassif[pID])>0: # contamination from an other run/barcode
+                    list_of_reads_from_contamination.append(record)
+                    dict_count_pIDs_from_contamination[pID]+=(nb_reads_fwd+nb_reads_rev)
+                else: # bad alignment with all the ref. seq.
                     list_of_reads_to_write_in_bad_aligned_file.append(record)
-                    dict_count_pIDs_to_bad_aligned_file[pID]+=1
-                else: # it aligns well with the ref. seq. of an other barcode (run 1 or run 2)
-                    # just throw it away
-                    dict_count_pIDs_from_contamination[pID]+=1
-            else: # the seq. for pID aligns well with the ref. seq. of its barcode
-                # to be written in the filtered reads (neighbors_indels) decontaminated file
+                    dict_count_pIDs_to_bad_aligned_file[pID]+=(nb_reads_fwd+nb_reads_rev)
+                    count_reads_no_good_alignments += (nb_reads_fwd+nb_reads_rev)
+            else: # good alignment with the associated ref. seq.
                 list_of_reads_to_write_in_decontaminated_file.append(record)
-                dict_count_pIDs_to_decontaminated_file[pID]+=1
-
-    # write the filtered reads (neighbors_indels) decontaminated file
-    suffix_filtered_reads_file = filtered_reads_file_basename.split('_')[-1]
-    with open(str(path_to_data+prefix_date_and_id+'-decontaminated'+'_'+barcode+suffix_filtered_reads_file), 'w') as outfile:
-        print 'write decontaminated file for run '+str(RUN)+', barcode: '+barcode
+                dict_count_pIDs_to_decontaminated_file[pID]+= (nb_reads_fwd+nb_reads_rev)
+                count_reads_good_alignments += (nb_reads_fwd+nb_reads_rev)
+    
+        
+   # print 'RUN-barcode '+true_seq_id+': #pIDs with better alignments from other barcodes: '+str(count_pIDs_reclassif)
+   # print '-----> #pIDs with no good alignments: '+ str(count_pIDs_no_good_alignments)
+   # print '-----> #reads with no good alignments: '+ str(count_reads_no_good_alignments)
+   # print '-----> #reads with good alignments: '+ str(count_reads_good_alignments)
+   # print '##########'
+
+    # write the filtered reads decontaminated file
+    with open(barcode_dir+readtype+'_decontaminated_reads.fasta','w') as outfile:
+        print 'write decontaminated file for run-barcode: '+str(true_seq_id)
         for record in list_of_reads_to_write_in_decontaminated_file:
             outfile.write('>'+str(record.id)+'\n')
             outfile.write(str(record.seq)+'\n')
         print '-- #reads written: '+str(len(list_of_reads_to_write_in_decontaminated_file))
-
-    # write the (neighbors_indels) bad aligned reads file
-    with open(str(path_to_data+prefix_date_and_id+'-bad-aligned'+'_'+barcode+suffix_filtered_reads_file), 'w') as outfile:
-        print 'write bad aligned reads file for run '+str(RUN)+', barcode: '+bc
+                
+    # write the bad aligned filtered reads file
+    with open(barcode_dir+readtype+'_bad_aligned_reads.fasta','w') as outfile:
+        print 'write bad aligned reads file for run-barcode: '+str(true_seq_id)
         for record in list_of_reads_to_write_in_bad_aligned_file:
             outfile.write('>'+str(record.id)+'\n')
             outfile.write(str(record.seq)+'\n')
         print '-- #reads written: '+str(len(list_of_reads_to_write_in_bad_aligned_file))
 
-    # 9. print statistics
+
+    # 9. print decontamination statistics
     print '#############'
-    #for bc in barcodes[prefix_date_and_id[RUN-1]]:
-    print '-RUN '+str(RUN)+', barcode '+bc
+    print '-RUN-barcode '+str(true_seq_id)
     print '---> nb non contaminant pIDs: '+str(len(dict_count_pIDs_to_decontaminated_file.keys()))
-    print '---> nb non contaminant reads: '+str(sum([dict_count_pIDs_to_decontaminated_file[i] for i in dict_count_pIDs_to_decontaminated_file.keys()]))
+    print '---> nb non contaminant reads: '+str(sum(dict_count_pIDs_to_decontaminated_file.values()))
     print '--'
     print '---> nb contaminant pIDs: '+str(len(dict_count_pIDs_from_contamination.keys()))
-    print '---> nb contaminant reads: '+str(sum([dict_count_pIDs_from_contamination[i] for i in dict_count_pIDs_from_contamination.keys()]))
+    print '---> nb contaminant reads: '+str(sum(dict_count_pIDs_from_contamination.values()))
     print '--'
     print '---> nb bad aligned but non contaminant pIDs : '+str(len(dict_count_pIDs_to_bad_aligned_file.keys()))
-    print '---> nb bad aligned but non contaminant reads: '+str(sum([dict_count_pIDs_to_bad_aligned_file[i] for i in dict_count_pIDs_to_bad_aligned_file.keys()]))
+    print '---> nb bad aligned but non contaminant reads: '+str(sum(dict_count_pIDs_to_bad_aligned_file.values()))
 
 
 else:
-    print auto_file_name+': usage: '+auto_file_name+' <filtered reads file (in ../data/)> <consensus seq. file (in ../templates/<date-and-id>_consensus/)> <ref. seq. file (in ../data/)> <run_number>'
+    #print auto_file_name+': usage: '+auto_file_name+' <filtered reads file (in ../data/)> <consensus seq. file (in ../templates/<date-and-id>_consensus/)> <ref. seq. file (in ../data/)> <run_number>'
+    print auto_file_name+': usage: '+auto_file_name+' <barcode_dir> <ref_seq_file_name> read_type true_seq_id'
 
 
  
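The decontamination loop above assumes read identifiers of the form <pID>_<nb_fwd>_<nb_rev>_<original_pID>. A sketch of that parsing step with a made-up identifier:

    record_id = 'AAAC_12_9_AAAC'                   # hypothetical FASTA read id
    pID, nb_reads_fwd, nb_reads_rev, pID_orig = record_id.split('_')[0:4]
    nb_reads_fwd = int(nb_reads_fwd)               # 12
    nb_reads_rev = int(nb_reads_rev)               # 9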
diff --git a/src/p8_detect_mutants_indels.py b/src/p8_detect_mutants_indels.py
old mode 100644 (file)
new mode 100755 (executable)
index 4bbb1a1..afe17c1
@@ -91,44 +91,46 @@ if (len(sys.argv) == 3):
     for pID in dict_all_reads.keys():
         dict_neighbor_state_pIDs[pID] = False
 
-    list_occ_pIDs_dec = sorted(pID_abundance.items(), key = lambda x:x[0], reverse=True)
+    list_occ_pIDs_dec = sorted(pID_abundance.items(), key = lambda x:x[1], reverse=True)
     list_occ_pIDs_inc = list_occ_pIDs_dec[::-1]
 
     print 'Check list_occ_pIDs_dec and list_occ_pIDs_inc lengths (should be equal): '        
     print len(list_occ_pIDs_dec), len(list_occ_pIDs_inc)
-
+    #print list_occ_pIDs_dec
+    #print list_occ_pIDs_inc
     # dictionary containing the parent pID (values) of a given pID (keys)
     dict_neighbor_of = defaultdict(list)
     dict_neighbors = defaultdict(list)
 
-    lim_pID_a = [list_occ_pIDs_inc[i][0]>=MIN_OCC_MOST_ABUND_PID for i in range(len(list_occ_pIDs_inc))].count(True)
+    #lim_pID_a = [list_occ_pIDs_inc[i][0]>=MIN_OCC_MOST_ABUND_PID for i in range(len(list_occ_pIDs_inc))].count(True)
+    lim_pID_a = [i[1]>=MIN_OCC_MOST_ABUND_PID for i in list_occ_pIDs_inc].count(True)
     print 'lim_pID_a:'+str(lim_pID_a)
-    print 'nb occ pID_a at pos lim_pID_a-1 in list_occ_pIDs_dec: '+str(list_occ_pIDs_dec[lim_pID_a-1][0])
-    print 'nb occ pID_a at pos lim_pID_a in list_occ_pIDs_dec: '+str(list_occ_pIDs_dec[lim_pID_a][0])
+    print 'nb occ pID_a at pos lim_pID_a-1 in list_occ_pIDs_dec: '+str(list_occ_pIDs_dec[lim_pID_a-1][1])
+    print 'nb occ pID_a at pos lim_pID_a in list_occ_pIDs_dec: '+str(list_occ_pIDs_dec[lim_pID_a][1])
 
     for ii_a, pID_a in enumerate(list_occ_pIDs_dec[:lim_pID_a]):
         # this condition should always evaluate to true
-        if dict_neighbor_state_pIDs[pID_a[1]] == False:
+        if dict_neighbor_state_pIDs[pID_a[0]] == False:
             continue
         # loop over rare pid
         for ii,pID_r in enumerate(list_occ_pIDs_inc[:-(ii_a+1)]):
             # check whether this PID is already a neighbor of some other
-             if((dict_neighbor_state_pIDs[pID_r[1]] == False) and (pID_r[0] < pID_a[0])):
-                pIDs_align_score = align.localms(pID_a[1], pID_r[1], 1, 0, -0.6, -0.3)
+             if((dict_neighbor_state_pIDs[pID_r[0]] == False) and (pID_r[1] < pID_a[1])):
+                pIDs_align_score = align.localms(pID_a[0], pID_r[0], 1, 0, -0.6, -0.3)
                 if (len(pIDs_align_score) != 0): # if pID_a[1] and pID_r[1] can be aligned
-                    if (pIDs_align_score[0][2] >= len(pID_a[1]) - max_pid_alignment_dist):
+                    if (pIDs_align_score[0][2] >= len(pID_a[0]) - max_pid_alignment_dist):
                         print 'pID_a: '+str(pID_a)+', pID_r: '+str(pID_r)
                         print pIDs_align_score[0][0], pIDs_align_score[0][1]
-                        print dict_neighbor_state_pIDs[pID_a[1]], dict_neighbor_state_pIDs[pID_r[1]]
+                        print dict_neighbor_state_pIDs[pID_a[0]], dict_neighbor_state_pIDs[pID_r[0]]
                         print "possible neighbors"
 
-                        if pID_r[0]>2: 
-                            neighbor_candidates = dict_cons_seq[pID_r[1]]
+                        if pID_r[1]>2: 
+                            neighbor_candidates = dict_cons_seq[pID_r[0]]
                         else:
-                            neighbor_candidates = dict_all_reads[pID_r[1]]
+                            neighbor_candidates = dict_all_reads[pID_r[0]]
 
 
-                        seq_a = dict_cons_seq[pID_a[1]][0][2] 
+                        seq_a = dict_cons_seq[pID_a[0]][0][2] 
                         #if (pID_r[0] != 2 or (pID_r[0] == 2 and len(dict_all_reads[pID_r[1]])==1)): # pID_r occurs once or more than twice, or twice with identical reads: only one sequence comparison to do
                             #if(pID_r[0] <= 2): # pID_r occurs once or twice (with identical reads): its sequence is in dict_all_reads
                                 # seq_r = dict_all_reads[pID_r[0]][0][2]
@@ -143,9 +145,9 @@ if (len(sys.argv) == 3):
 
                         for nf, nr, seq_r in neighbor_candidates:
                             if lt.check_neighbor_plausibility(seq_a, seq_r, DIST_MAX):
-                                dict_neighbors[pID_a[1]].append(pID_r[1])
-                                dict_neighbor_of[pID_r[1]] = pID_a[1]
-                                dict_neighbor_state_pIDs[pID_r[1]] = True
+                                dict_neighbors[pID_a[0]].append(pID_r[0])
+                                dict_neighbor_of[pID_r[0]] = pID_a[0]
+                                dict_neighbor_state_pIDs[pID_r[0]] = True
                                 print "NEIGHBORS !"
                                 break
 
@@ -166,7 +168,7 @@ if (len(sys.argv) == 3):
     count_neighbors_reads_written = 0
 
     # write the new filtered reads file (mutant-indels):
-    corrected_aligned_reads_fname = barcode_dir+'corrected_reads.fasta'
+    corrected_aligned_reads_fname = barcode_dir+'corrected_aligned_reads.fasta'
     with open(corrected_aligned_reads_fname, 'w') as neighbors_filtered_reads_file:
         for pID in dict_all_reads.keys(): # for all pIDs                                
             if not dict_neighbor_state_pIDs[pID]: # if pID "is not a neighbor"
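The p8 changes swap the tuple indices: after sorting pID_abundance.items() by occurrence count, each element is a (pID, count) pair, so the pID string is element [0] and its abundance element [1]. A small sketch with made-up abundances:

    pID_abundance = {'AAAA': 57, 'AAAC': 3, 'AAAG': 1}     # hypothetical pID -> read count
    list_occ_pIDs_dec = sorted(pID_abundance.items(), key=lambda x: x[1], reverse=True)
    # -> [('AAAA', 57), ('AAAC', 3), ('AAAG', 1)]
    pID_a = list_occ_pIDs_dec[0]
    pID_a[0]   # 'AAAA' : the pID itself, used as dictionary key
    pID_a[1]   # 57     : its occurrence count, used for the abundance tests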