renamed files formerly p8 and p7, removed p6 and now part of p4
authorRichard <richard.neher@tuebingen.mpg.de>
Thu, 26 Sep 2013 16:28:14 +0000 (18:28 +0200)
committerRichard <richard.neher@tuebingen.mpg.de>
Thu, 26 Sep 2013 16:28:14 +0000 (18:28 +0200)
src/p5_decontaminate_all.py [new file with mode: 0644]
src/p5_decontamination.py [new file with mode: 0755]
src/p6_align_global_per_barcode.py [deleted file]
src/p6_correct.all [new file with mode: 0644]
src/p6_detect_mutants_indels.py [new file with mode: 0755]
src/p7_decontaminate_all.py [deleted file]
src/p7_decontamination.py [deleted file]
src/p8_correct_all.py [deleted file]
src/p8_detect_mutants_indels.py [deleted file]

diff --git a/src/p5_decontaminate_all.py b/src/p5_decontaminate_all.py
new file mode 100644 (file)
index 0000000..b959b07
--- /dev/null
@@ -0,0 +1,29 @@
+#!/usr/bin/python
+
+import os
+import sys
+import time
+import glob 
+
+####
+# retrieve the temp reads file names to align in the specific cluster directory (in ../templates/) given in input
+# the given cluster directory path must be relative to the current directory (src)
+####
+if(len(sys.argv)==2):
+    run_dir = str(sys.argv[1])
+    run_dir = run_dir.rstrip('/')+'/'
+
+    run  = run_dir[-2]
+    barcode_dir_list = glob.glob(run_dir+'bc_*_analysis')
+    for analysis_dir in barcode_dir_list:
+        barcode = analysis_dir.split('_')[-2]
+        #collect the list of temp reads files to align
+        cmd = 'qsub -cwd -l h_rt=00:59:00 -l h_vmem=10G ./src/p5_decontamination.py '+analysis_dir+' ./data/ref_seq_for_decontamination.fasta filtered '+'"'+run+' '+barcode+'"'
+        print cmd
+        os.system(cmd)
+
+else:
+    print sys.argv[0]+': usage: '+sys.argv[0]+' <directory of a run>'
+
+
+
diff --git a/src/p5_decontamination.py b/src/p5_decontamination.py
new file mode 100755 (executable)
index 0000000..1d123c6
--- /dev/null
@@ -0,0 +1,193 @@
+#!/usr/bin/python
+#/!\#
+#### /ebio/ag-neher/share/pograms/EPD/bins/python
+#/!\#
+
+# script that decontaminates a filtered reads file for a given barcode and run id
+
+import numpy as np
+from Bio import AlignIO
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.Align import MultipleSeqAlignment
+from collections import Counter
+from collections import defaultdict
+import sys
+import datetime
+import time
+sys.path.append('./src/')
+import lib_tools as lt
+
+
+auto_file_name=str(sys.argv[0])
+
+DIST_MAX = 7
+
+if (len(sys.argv)==5):
+
+    # parse the input file names
+    barcode_dir = str(sys.argv[1]).rstrip('/')+'/'
+    ref_seq_file_name = str(sys.argv[2])
+    readtype = sys.argv[3]
+    true_seq_id = sys.argv[4]
+
+    # parse the reference sequences file in dict_ref_seq
+
+    dict_ref_seq ={}
+    with open(ref_seq_file_name, 'r') as infile:
+        for seq in SeqIO.parse(infile, 'fasta'):
+            dict_ref_seq[seq.description]=str(seq.seq)
+
+
+    #2.parse the consensus sequences files for the given prefix_date_and_id in dict_cons_seq
+    dict_cons_seq, a,b = lt.parse_readfile(barcode_dir+'consensus_sequences_'+readtype+'.fasta')
+    
+    
+    #/!\ : aligned reads: need to trim the gaps at one point
+    dict_all_reads, nreads, nbadreads = lt.parse_readfile(barcode_dir+'aligned_reads_'+readtype+'.fasta')
+
+    dict_score_alignments = defaultdict(list)
+    for pID in dict_all_reads:
+        if pID in dict_cons_seq:
+            #print 'FLAG:'
+            #print dict_cons_seq[pID]
+            tmp_cons_seq = dict_cons_seq[pID][0][2]
+            score_align = lt.align_SW_dist(dict_ref_seq[true_seq_id], tmp_cons_seq)[2]
+            print '----'+str(true_seq_id)+': '+pID+', align score: '+str(score_align)
+            dict_score_alignments[pID].append(score_align)
+        else:
+            for nf, nb, seq in dict_all_reads[pID]:
+                score_align = lt.align_SW_dist(dict_ref_seq[true_seq_id], lt.remove_gaps(seq))[2]
+                print '----'+str(true_seq_id)+': '+pID+', align score: '+str(score_align)
+                dict_score_alignments[pID].append(score_align)
+
+
+    #4.print some statistics
+    print '** LIM SCORE ALIGN-'+true_seq_id
+    #print '---- min = '+str(min([dict_score_alignments[i] for i in dict_score_alignments.keys()]))
+    print '---- min = '+str(min(np.hstack(dict_score_alignments.values())))
+    #print '---- max = '+str(max([dict_score_alignments[i] for i in dict_score_alignments.keys()]))
+    print '---- max = '+str(max(np.hstack(dict_score_alignments.values())))
+    #average_scores=np.mean([dict_score_alignments[i] for i in dict_score_alignments.keys()])
+    average_scores=np.mean(np.hstack(dict_score_alignments.values()))
+    print '---- average score = '+str(average_scores)
+    print '---- #pids (total) = '+str(len(dict_all_reads.keys()))
+
+
+    #5.count good and bad alignments
+    
+
+    #count_bad_scores = sum([sum([x<len(dict_ref_seq[true_seq_id])-DIST_MAX for x in reads]) for reads in dict_score_alignments])
+    #nb_bad_scores_reads = sum([sum([x<len(dict_ref_seq[true_seq_id])-DIST_MAX for x in reads]) for reads in dict_score_alignments])
+    #count_good_scores = sum([sum([x>=len(dict_ref_seq[true_seq_id])-DIST_MAX for x in reads]) for reads in dict_score_alignments])
+    #nb_good_scores_reads = sum([sum([x<len(dict_ref_seq[true_seq_id])-DIST_MAX for x in reads]) for reads in dict_score_alignments])
+
+                                                                         
+    #print '****'
+    #print '--RUN-barcode: '+str(true_seq_id)+': #bad align scores: '+str(count_bad_scores)
+    #print '--RUN-barcode: '+str(true_seq_id)+': #bad align scores (reads): '+str(nb_bad_scores_reads)
+    #print '--RUN-barcode: '+str(true_seq_id)+': #good align scores: '+str(count_good_scores)
+    #print '--RUN-barcode: '+str(true_seq_id)+': #good align scores (reads): '+str(nb_good_scores_reads)
+
+    # 6.check contamination
+    print '********'
+    print 'Check contamination'
+    dict_possible_good_align={}#contains the alignment scores of the pIDs cons. seq. for bc with the ref. seq. of the other barcodes
+    dict_reclassif={}
+    dict_count_reclassif_pIDs=defaultdict(int)
+    dict_count_reclassif_reads=defaultdict(int)
+    print 'RUN - barcode: '+str(true_seq_id)+':'
+    #index_bc = barcodes[prefix_date_and_id[RUN-1]].index(bc)#index of the barcode bc in the barcodes list
+    barcodes_to_test = [ref_seq_bc for ref_seq_bc in dict_ref_seq.keys() if (ref_seq_bc != true_seq_id)]
+
+    
+
+
+    for pID,align_scores in dict_score_alignments.iteritems():
+        if(max(align_scores) < len(dict_ref_seq[true_seq_id]) - DIST_MAX):
+            # if the alignment score of cons. seq. for pID with the reference sequence for bc is lower than the threshold
+            print '----found 1 bad alignment: '+pID+', '+str(dict_score_alignments[pID])+' < '+str(len(dict_ref_seq[true_seq_id])-DIST_MAX)
+            dict_possible_good_align[pID]=[]
+            dict_reclassif[pID]=[]
+            is_a_thrice_or_more = pID in dict_cons_seq.keys()
+            is_a_twice_with_distinct_reads = len(align_scores)==2
+            for ref_seq_id_2 in barcodes_to_test:# compare the alignment scores of cons. seq. with the ref. seq. for the other barcodes
+                if is_a_thrice_or_more:
+                    list_seq = [dict_cons_seq[pID][0][2]]
+                else:
+                    list_seq = [seq[2] for seq in dict_all_reads[pID]]
+                score_align_with_ref_seq_id_2 = max([lt.align_SW_dist(dict_ref_seq[ref_seq_id_2], lt.remove_gaps(seq))[2] for seq in list_seq])
+
+                dict_possible_good_align[pID].append((ref_seq_id_2,score_align_with_ref_seq_id_2))
+                if(score_align_with_ref_seq_id_2 >= len(dict_ref_seq[ref_seq_id_2])-DIST_MAX):
+                    print '------possible reclassif of pID '+pID+' in ref_seq_id_2 '+str(ref_seq_id_2)+', with align score: '+str(score_align_with_ref_seq_id_2)+' >= '+str(len(dict_ref_seq[ref_seq_id_2])-DIST_MAX)
+                    dict_count_reclassif_pIDs[ref_seq_id_2]+=1
+                    dict_count_reclassif_reads[ref_seq_id_2]+=sum([sum(map(int,np.array(dict_all_reads[pID])[:,i])) for i in [0,1]]) #dict_pIDs_occ[pID]
+                    dict_reclassif[pID].append((ref_seq_id_2,score_align_with_ref_seq_id_2))
+
+  
+
+    # 7.print contamination statistics
+    # 8.decontaminate the filtered reads file
+    dict_count_pIDs_to_bad_aligned_file=defaultdict(int)
+    dict_count_pIDs_from_contamination=defaultdict(int)
+    dict_count_pIDs_to_decontaminated_file=defaultdict(int)
+
+    with open(barcode_dir+'aligned_reads_'+readtype+'_decontaminated.fasta','w') as outfile_good, \
+            open(barcode_dir+'aligned_reads_'+readtype+'_bad_aligned.fasta','w') as outfile_bad:
+        sorted_pIDs = sorted(dict_all_reads.keys())
+        for pID in sorted_pIDs:
+            if pID in dict_reclassif:
+                read_count=0
+                for read in dict_all_reads[pID]:
+                    outfile_bad.write('>'+lt.read_label(pID, read[0], read[1])+'\n')
+                    outfile_bad.write(read[2]+'\n')
+                    read_count+=read[0] + read[1]
+                if len(dict_reclassif[pID])>0:
+                    dict_count_pIDs_from_contamination[pID] = read_count
+                else:
+                    dict_count_pIDs_to_bad_aligned_file[pID] = read_count
+            else:
+                read_count=0
+                for read in dict_all_reads[pID]:
+                    outfile_good.write('>'+lt.read_label(pID, read[0], read[1])+'\n')
+                    outfile_good.write(read[2]+'\n')                
+                    read_count+=read[0] + read[1]
+                dict_count_pIDs_to_decontaminated_file[pID]+=read_count
+    print '-- # good reads written: '+str(sum(dict_count_pIDs_to_decontaminated_file.values()))
+    print '-- # contaminant reads written: '+str(sum(dict_count_pIDs_from_contamination.values()))
+    print '-- # reads of unknown origin written: '+str(sum(dict_count_pIDs_to_bad_aligned_file.values()))
+
+    # write the consensus sequences file
+    with open(barcode_dir+'consensus_sequences_'+readtype+'_decontaminated.fasta','w') as outfile:
+        print 'write decontaminated file for run-barcode: '+str(true_seq_id)
+        consensus_pIDs = sorted(dict_cons_seq.keys())
+        for pID in consensus_pIDs:
+            if pID not in dict_reclassif:
+                rec = dict_cons_seq[pID][0]
+                outfile.write('>'+lt.read_label(pID, rec[0], rec[1])+'\n')
+                outfile.write(rec[2]+'\n')
+
+
+    # 9. print decontamination statistics
+    print '#############'
+    print '-RUN-barcode '+str(true_seq_id)
+    print '---> nb non contaminant pIDs: '+str(len(dict_count_pIDs_to_decontaminated_file.keys()))
+    print '---> nb non contaminant reads: '+str(sum(dict_count_pIDs_to_decontaminated_file.values()))
+    print '--'
+    print '---> nb contaminant pIDs: '+str(len(dict_count_pIDs_from_contamination.keys()))
+    print '---> nb contaminant reads: '+str(sum(dict_count_pIDs_from_contamination.values()))
+    print '--'
+    print '---> nb bad aligned but non contaminant pIDs : '+str(len(dict_count_pIDs_to_bad_aligned_file.keys()))
+    print '---> nb bad aligned but non contaminant reads: '+str(sum(dict_count_pIDs_to_bad_aligned_file.values()))
+
+
+else:
+    #print auto_file_name+': usage: '+auto_file_name+' <filtered reads file (in ../data/)> <consensus seq. file (in ../templates/<date-and-id>_consensus/)> <ref. seq. file (in ../data/)> <run_number>'
+    print auto_file_name+': usage: '+auto_file_name+' <barcode_dir> <ref_seq_file_name> read_type true_seq_id'
+
+
+    
diff --git a/src/p6_align_global_per_barcode.py b/src/p6_align_global_per_barcode.py
deleted file mode 100755 (executable)
index 8a47201..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/python
-
-
-# Script that aligns, for a given barcode, all the consensus sequences (each one corresponding to a pID)
-# Input: consensus sequences file for a given barcode (in ../templates/dir-<date_and_id>_consensus/)
-# Output: aligned consensus sequences file for a given barcode (in ../templates/dir-<date_and_id>_align-global/)
-
-
-import numpy as np
-from Bio import SeqIO
-from Bio.Align.Applications import MuscleCommandline
-import glob
-import sys
-import time
-import lib_tools as lt
-
-
-auto_file_name = str(sys.argv[0])
-
-######
-# DEF FUNCTIONS
-######
-
-
-if(len(sys.argv)==3):
-    # parse the input consensus sequences file name
-    rundir=str(sys.argv[1]).rstrip('/')+'/'
-    read_type = '_'+sys.argv[2]
-
-    barcode_directories = glob.glob(rundir+'bc_*_analysis*')
-    for dname in barcode_directories:
-        consensus_file_list = glob.glob(dname+'/consensus*'+read_type+'.fasta')
-        time_start = time.time()
-        for fname in consensus_file_list:
-            aligned_fname = lt.trim_extension(fname)+'_aligned.fasta'
-
-            try:
-                cline = MuscleCommandline(input = fname, out = aligned_fname)
-                cline()
-            except:
-                print "Trouble aligning", fname
-            consensus_seqs, counts_good, counts_bad = lt.parse_readfile(aligned_fname)
-            with open(aligned_fname, 'w') as outfile:
-                sorted_pIDs = sorted(consensus_seqs.keys())
-                for pID in sorted_pIDs:
-                    for fwd, rev, seq in consensus_seqs[pID]:
-                        outfile.write('>'+lt.read_label(pID, fwd, rev)+'\n')
-                        outfile.write(seq+'\n')
-                                      
-
-        time_end = time.time()
-        print 'alignment computation time: ' + str(time_end - time_start)
-
-else:
-    print auto_file_name + ': usage: '+ auto_file_name + ' <run director> <read type>'
diff --git a/src/p6_correct.all b/src/p6_correct.all
new file mode 100644 (file)
index 0000000..179c6b6
--- /dev/null
@@ -0,0 +1,28 @@
+#!/usr/bin/python
+
+import os
+import sys
+import time
+import glob 
+
+####
+# retrieve the temp reads file names to align in the specific cluster directory (in ../templates/) given in input
+# the given cluster directory path must be relative to the current directory (src)
+####
+if(len(sys.argv)==3):
+    run_dir = str(sys.argv[1])
+    run_dir = run_dir.rstrip('/')+'/'
+    read_type = sys.argv[2]
+
+    barcode_dir_list = glob.glob(run_dir+'bc_*_analysis')
+    for analysis_dir in barcode_dir_list:
+        #collect the list of temp reads files to align
+        cmd = 'qsub -cwd -l h_rt=00:59:00 -l h_vmem=10G ./src/p6_detect_mutants_indels.py '+analysis_dir+ ' ' +read_type
+        print cmd
+        os.system(cmd)
+
+else:
+    print sys.argv[0]+': usage: '+sys.argv[0]+' <directory of a run> <read_type>'
+
+
+
diff --git a/src/p6_detect_mutants_indels.py b/src/p6_detect_mutants_indels.py
new file mode 100755 (executable)
index 0000000..c7346b0
--- /dev/null
@@ -0,0 +1,209 @@
+#!/usr/bin/python
+
+####
+# Script that detects the possible mutants (pID mutations) with indels and generates a new filtered reads file (in ../data) resolving the pID mutations 
+# 
+# input: - filtered reads file (in ../data/)
+#        - consensus sequences file (in ../templates/dir-<date_and_id>_consensus/)
+# 
+# output: filtered reads file (in ../data/) resolving the pID mutations, new <date_and_id> = <date_and_id>-mutant-indels
+#         new read id: <pID-parent>_<nb-reads-fwd>_<nb-reads-rev>_<pID-origin>  
+####
+
+import numpy as np
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.Alphabet import generic_dna
+from collections import Counter
+from collections import defaultdict
+from Bio.pairwise2 import align
+from itertools import imap
+import operator
+import sys
+sys.path.append('./src')
+import datetime
+import time
+import lib_tools as lt
+
+auto_file_name = str(sys.argv[0])
+
+
+# distance max allowed for the consensus seq. alignments
+DIST_MAX = 3
+
+# minimum #times a pID must occur to have neighbors 
+MIN_OCC_MOST_ABUND_PID = 10
+
+max_pid_alignment_dist = 1.6
+# anything <1 does not allow any substitions
+# 1 allows only one point mutation
+# 1.6 allows one indel and one substitution
+# alignment parameters: match=1, unmatch=0, opening gap penalty=-0.6, gap extension penalty=-0.3
+
+
+########
+########
+
+
+
+if (len(sys.argv) == 3):
+    # parse the input data files names
+    barcode_dir = str(sys.argv[1]).rstrip('/')+'/'
+    readtype = sys.argv[2]
+      
+    # Create the dictionary of pIDs (from the output files of p1_trim_and_filter)
+
+    ###################################
+    # -- READ THE FILTERED READS FILE #
+    ###################################
+    # dictionary containing the list of reads (values) associated to a pID (keys)
+    dict_all_reads, count_reads_added_to_dict_all_reads, count_reads_with_invalid_pID = lt.parse_readfile(barcode_dir+'aligned_reads_'+readtype+'.fasta')
+    # control prints
+    ####
+    print 'count_reads_with_invalid_pID: '+str(count_reads_with_invalid_pID)
+    print 'count_reads_added_to_dict_all_reads: '+str(count_reads_added_to_dict_all_reads)
+    print '#pIDs different: '+str(len(dict_all_reads.keys()))
+
+
+    pID_abundance = {}
+
+    for pID,reads in dict_all_reads.iteritems():
+        pID_abundance[pID] = sum([x[0]+x[1] for x in reads])
+
+    print '#pIDs occurring only once: ' + str(sum([x==1 for x in pID_abundance.values()]))
+    print '#pIDs occurring only twice: ' + str(sum([x==2 for x in pID_abundance.values()]))
+    print '#pIDs occurring at least 3 times: ' + str(sum([x>2 for x in pID_abundance.values()]))
+    ####
+
+
+    ########################################
+    # -- READ THE CONSENSUS SEQUENCES FILE #
+    ########################################
+    # dictionary containing the consensus sequence (values) associated to a pID (keys) if it occurs at least 3 times
+    dict_cons_seq, a, b = lt.parse_readfile(barcode_dir+'consensus_sequences_'+readtype+'.fasta')
+    print 'From consensus seq. file :'
+    print '#pIDs occurring at least 3 times: ' + str(len(dict_cons_seq.keys()))
+
+
+    ###########################
+    # -- SEARCH THE NEIGHBORS #
+    ###########################
+    # dictionary indicating the "neighbor" state of each pID
+    dict_neighbor_state_pIDs = {}
+    for pID in dict_all_reads.keys():
+        dict_neighbor_state_pIDs[pID] = False
+
+    list_occ_pIDs_dec = sorted(pID_abundance.items(), key = lambda x:x[1], reverse=True)
+    list_occ_pIDs_inc = list_occ_pIDs_dec[::-1]
+
+    print 'Check list_occ_pIDs_dec and list_occ_pIDs_inc lengths (should be equal): '        
+    print len(list_occ_pIDs_dec), len(list_occ_pIDs_inc)
+    #print list_occ_pIDs_dec
+    #print list_occ_pIDs_inc
+    # dictionary containing the parent pID (values) of a given pID (keys)
+    dict_neighbor_of = defaultdict(list)
+    dict_neighbors = defaultdict(list)
+
+    #lim_pID_a = [list_occ_pIDs_inc[i][0]>=MIN_OCC_MOST_ABUND_PID for i in range(len(list_occ_pIDs_inc))].count(True)
+    lim_pID_a = [i[1]>=MIN_OCC_MOST_ABUND_PID for i in list_occ_pIDs_inc].count(True)
+    print 'lim_pID_a:'+str(lim_pID_a)
+    print 'nb occ pID_a at pos lim_pID_a-1 in list_occ_pIDs_dec: '+str(list_occ_pIDs_dec[lim_pID_a-1][1])
+    print 'nb occ pID_a at pos lim_pID_a in list_occ_pIDs_dec: '+str(list_occ_pIDs_dec[lim_pID_a][1])
+
+    for ii_a, pID_a in enumerate(list_occ_pIDs_dec[:lim_pID_a]):
+        # this condition should always evaluate to false
+        if dict_neighbor_state_pIDs[pID_a[0]] == True:
+            continue
+        # loop over rare pid
+        for ii,pID_r in enumerate(list_occ_pIDs_inc[:-(ii_a+1)]):
+            # check whether this PID is already a neighbor of some other
+             if((dict_neighbor_state_pIDs[pID_r[0]] == False) and (pID_r[1] < pID_a[1])):
+                pIDs_align_score = align.localms(pID_a[0], pID_r[0], 1, 0, -0.6, -0.3)
+                if (len(pIDs_align_score) != 0): # if pID_a[1] and pID_r[1] can be aligned
+                    if (pIDs_align_score[0][2] >= len(pID_a[0]) - max_pid_alignment_dist):
+                        print 'pID_a: '+str(pID_a)+', pID_r: '+str(pID_r)
+                        print pIDs_align_score[0][0], pIDs_align_score[0][1]
+                        print dict_neighbor_state_pIDs[pID_a[0]], dict_neighbor_state_pIDs[pID_r[0]]
+                        print "possible neighbors"
+
+                        if pID_r[1]>2: 
+                            neighbor_candidates = dict_cons_seq[pID_r[0]]
+                        else:
+                            neighbor_candidates = dict_all_reads[pID_r[0]]
+
+
+                        seq_a = dict_cons_seq[pID_a[0]][0][2] 
+                        #if (pID_r[0] != 2 or (pID_r[0] == 2 and len(dict_all_reads[pID_r[1]])==1)): # pID_r occurs once or more than twice, or twice with identical reads: only one sequence comparison to do
+                            #if(pID_r[0] <= 2): # pID_r occurs once or twice (with identical reads): its sequence is in dict_all_reads
+                                # seq_r = dict_all_reads[pID_r[0]][0][2]
+                                #seq_r = dict_all_reads[pID_r[1]][0][2] # [nb_fwd,nb_rev,seq]
+                            #else: # pID_r occurs at least 3 times: its consensus sequence is in dict_cons_seq
+                                #seq_r = dict_cons_seq[pID_r[1]][0][2] # [nb_fwd,nb_rev,seq]
+                        #else: # pID_r occurs twice with distinct reads: if one read aligns well: pID_r is considered as a mutant
+                            #[seq_r_0,seq_r_1] = [dict_all_reads[pID_r[1]][i][2] for i in [0,1]]
+                            
+                        #if(pID_)
+
+                        for nf, nr, seq_r in neighbor_candidates:
+                            if lt.check_neighbor_plausibility(seq_a, seq_r, DIST_MAX, verbose=False):
+                                dict_neighbors[pID_a[0]].append(pID_r[0])
+                                dict_neighbor_of[pID_r[0]] = pID_a[0]
+                                dict_neighbor_state_pIDs[pID_r[0]] = True
+                                print "NEIGHBORS !"
+                                break
+
+
+    print 'neighbors:'
+    nb_neighbors_stated = 0
+    for pID,state in dict_neighbor_state_pIDs.iteritems():
+        if state:
+            nb_neighbors_stated += 1
+            print pID
+
+
+
+    ##############################################
+    # -- WRITE THE NEIGHBORS FILTERED READS FILE #
+    ##############################################
+    count_reads_written = 0
+    count_neighbors_reads_written = 0
+
+    # write the new filtered reads file (mutant-indels):
+    corrected_aligned_reads_fname = barcode_dir+'corrected_reads.fasta'
+    with open(corrected_aligned_reads_fname, 'w') as neighbors_filtered_reads_file:
+        for pID in dict_all_reads.keys(): # for all pIDs                                
+            if not dict_neighbor_state_pIDs[pID]: # if pID "is not a neighbor"
+                print 'write reads for pID: ' + pID
+
+                # write the reads corresponding to pID
+                for cur_read in dict_all_reads[pID]: # cur_read is a list [nb_occ_fwd,nb_occ_rev,seq]
+                    #pid_orig = pID
+                    #mut_flag = ''
+                    new_read_id = str(pID+'_'+str(cur_read[0])+'_'+str(cur_read[1])+'_'+pID)
+                    neighbors_filtered_reads_file.write('>'+new_read_id+'\n') # write the new id of the read
+                    neighbors_filtered_reads_file.write(cur_read[2]+'\n') # write the sequence
+                    count_reads_written += (cur_read[0]+cur_read[1])
+
+                # write the neighbors if there are
+                if (len(dict_neighbors[pID]) > 0):
+                    print '#neighbors pIDs: ' + str(len(dict_neighbors[pID]))
+                    for pID_n in dict_neighbors[pID]: # for all the neighbors pID_n of pID
+
+                        # write the neighbors pID_n
+                        for read_n in dict_all_reads[pID_n]: # read_n is a list [nb_occ_fwd,nb_occ_rev,seq]
+                            pid_orig = pID_n
+                            mut_flag = '_m'
+                            new_read_id = str(pID+'_'+str(read_n[0])+'_'+str(read_n[1])+'_'+pid_orig+mut_flag)
+                            neighbors_filtered_reads_file.write('>'+new_read_id+'\n') # write the new id of the read
+                            neighbors_filtered_reads_file.write(read_n[2]+'\n') # write the sequence
+                            count_reads_written += (read_n[0]+read_n[1])
+                            count_neighbors_reads_written += (read_n[0]+read_n[1])
+
+
+    print 'nb reads written: '+str(count_reads_written)
+    print 'nb neighbored reads written: '+str(count_neighbors_reads_written)
+    print 'nb neighbored pIDs: '+str(nb_neighbors_stated)
+
+
+else:
+    print auto_file_name + ': usage: '+ auto_file_name + ' <directory with reads from a barcode> <type of read>'
diff --git a/src/p7_decontaminate_all.py b/src/p7_decontaminate_all.py
deleted file mode 100644 (file)
index b959b07..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/python
-
-import os
-import sys
-import time
-import glob 
-
-####
-# retrieve the temp reads file names to align in the specific cluster directory (in ../templates/) given in input
-# the given cluster directory path must be relative to the current directory (src)
-####
-if(len(sys.argv)==2):
-    run_dir = str(sys.argv[1])
-    run_dir = run_dir.rstrip('/')+'/'
-
-    run  = run_dir[-2]
-    barcode_dir_list = glob.glob(run_dir+'bc_*_analysis')
-    for analysis_dir in barcode_dir_list:
-        barcode = analysis_dir.split('_')[-2]
-        #collect the list of temp reads files to align
-        cmd = 'qsub -cwd -l h_rt=00:59:00 -l h_vmem=10G ./src/p7_decontamination.py '+analysis_dir+' ./data/ref_seq_for_decontamination.fasta filtered '+'"'+run+' '+barcode+'"'
-        print cmd
-        os.system(cmd)
-
-else:
-    print sys.argv[1]+': usage: '+auto_file_name+' <directory of a run>'
-
-
-
diff --git a/src/p7_decontamination.py b/src/p7_decontamination.py
deleted file mode 100755 (executable)
index 1d123c6..0000000
+++ /dev/null
@@ -1,193 +0,0 @@
-#!/usr/bin/python
-#/!\#
-#### /ebio/ag-neher/share/pograms/EPD/bins/python
-#/!\#
-
-# script that decontaminates a filtered reads file for a given barcode and run id
-
-import numpy as np
-from Bio import AlignIO
-from Bio import SeqIO
-from Bio.Seq import Seq
-from Bio.Align import MultipleSeqAlignment
-from collections import Counter
-from collections import defaultdict
-import sys
-import datetime
-import time
-sys.path.append('./src/')
-import lib_tools as lt
-
-
-auto_file_name=str(sys.argv[0])
-
-DIST_MAX = 7
-
-if (len(sys.argv)==5):
-
-    # parse the input file names
-    barcode_dir = str(sys.argv[1]).rstrip('/')+'/'
-    ref_seq_file_name = str(sys.argv[2])
-    readtype = sys.argv[3]
-    true_seq_id = sys.argv[4]
-
-    # parse the reference sequences file in dict_ref_seq
-
-    dict_ref_seq ={}
-    with open(ref_seq_file_name, 'r') as infile:
-        for seq in SeqIO.parse(infile, 'fasta'):
-            dict_ref_seq[seq.description]=str(seq.seq)
-
-
-    #2.parse the consensus sequences files for the given prefix_date_and_id in dict_cons_seq
-    dict_cons_seq, a,b = lt.parse_readfile(barcode_dir+'consensus_sequences_'+readtype+'.fasta')
-    
-    
-    #/!\ : aligned reads: need to trim the gaps at one point
-    dict_all_reads, nreads, nbadreads = lt.parse_readfile(barcode_dir+'aligned_reads_'+readtype+'.fasta')
-
-    dict_score_alignments = defaultdict(list)
-    for pID in dict_all_reads:
-        if pID in dict_cons_seq:
-            #print 'FLAG:'
-            #print dict_cons_seq[pID]
-            tmp_cons_seq = dict_cons_seq[pID][0][2]
-            score_align = lt.align_SW_dist(dict_ref_seq[true_seq_id], tmp_cons_seq)[2]
-            print '----'+str(true_seq_id)+': '+pID+', align score: '+str(score_align)
-            dict_score_alignments[pID].append(score_align)
-        else:
-            for nf, nb, seq in dict_all_reads[pID]:
-                score_align = lt.align_SW_dist(dict_ref_seq[true_seq_id], lt.remove_gaps(seq))[2]
-                print '----'+str(true_seq_id)+': '+pID+', align score: '+str(score_align)
-                dict_score_alignments[pID].append(score_align)
-
-
-    #4.print some statistics
-    print '** LIM SCORE ALIGN-'+true_seq_id
-    #print '---- min = '+str(min([dict_score_alignments[i] for i in dict_score_alignments.keys()]))
-    print '---- min = '+str(min(np.hstack(dict_score_alignments.values())))
-    #print '---- max = '+str(max([dict_score_alignments[i] for i in dict_score_alignments.keys()]))
-    print '---- min = '+str(max(np.hstack(dict_score_alignments.values())))
-    #average_scores=np.mean([dict_score_alignments[i] for i in dict_score_alignments.keys()])
-    average_scores=np.mean(np.hstack(dict_score_alignments.values()))
-    print '---- average score = '+str(average_scores)
-    print '---- #pids (total) = '+str(len(dict_all_reads.keys()))
-
-
-    #5.count good and bad alignments
-    
-
-    #count_bad_scores = sum([sum([x<len(dict_ref_seq[true_seq_id])-DIST_MAX for x in reads]) for reads in dict_score_alignments])
-    #nb_bad_scores_reads = sum([sum([x<len(dict_ref_seq[true_seq_id])-DIST_MAX for x in reads]) for reads in dict_score_alignments])
-    #count_good_scores = sum([sum([x>=len(dict_ref_seq[true_seq_id])-DIST_MAX for x in reads]) for reads in dict_score_alignments])
-    #nb_good_scores_reads = sum([sum([x<len(dict_ref_seq[true_seq_id])-DIST_MAX for x in reads]) for reads in dict_score_alignments])
-
-                                                                         
-    #print '****'
-    #print '--RUN-barcode: '+str(true_seq_id)+': #bad align scores: '+str(count_bad_scores)
-    #print '--RUN-barcode: '+str(true_seq_id)+': #bad align scores (reads): '+str(nb_bad_scores_reads)
-    #print '--RUN-barcode: '+str(true_seq_id)+': #good align scores: '+str(count_good_scores)
-    #print '--RUN-barcode: '+str(true_seq_id)+': #good align scores (reads): '+str(nb_good_scores_reads)
-
-    # 6.check contamination
-    print '********'
-    print 'Check contamination'
-    dict_possible_good_align={}#contains the alignment scores of the pIDs cons. seq. for bc with the ref. seq. of the other barcodes
-    dict_reclassif={}
-    dict_count_reclassif_pIDs=defaultdict(int)
-    dict_count_reclassif_reads=defaultdict(int)
-    print 'RUN - barcode: '+str(true_seq_id)+':'
-    #index_bc = barcodes[prefix_date_and_id[RUN-1]].index(bc)#index of the barcode bc in the barcodes list
-    barcodes_to_test = [ref_seq_bc for ref_seq_bc in dict_ref_seq.keys() if (ref_seq_bc != true_seq_id)]
-
-    
-
-
-    for pID,align_scores in dict_score_alignments.iteritems():
-        if(max(align_scores) < len(dict_ref_seq[true_seq_id]) - DIST_MAX):
-            # if the alignment score of cons. seq. for pID with the reference sequence for bc is lower than the threshold
-            print '----found 1 bad alignment: '+pID+', '+str(dict_score_alignments[pID])+' < '+str(len(dict_ref_seq[true_seq_id])-DIST_MAX)
-            dict_possible_good_align[pID]=[]
-            dict_reclassif[pID]=[]
-            is_a_thrice_or_more = pID in dict_cons_seq.keys()
-            is_a_twice_with_distinct_reads = len(align_scores)==2
-            for ref_seq_id_2 in barcodes_to_test:# compare the alignment scores of cons. seq. with the ref. seq. for the other barcodes
-                if is_a_thrice_or_more:
-                    list_seq = [dict_cons_seq[pID][0][2]]
-                else:
-                    list_seq = [seq[2] for seq in dict_all_reads[pID]]
-                score_align_with_ref_seq_id_2 = max([lt.align_SW_dist(dict_ref_seq[ref_seq_id_2], lt.remove_gaps(seq))[2] for seq in list_seq])
-
-                dict_possible_good_align[pID].append((ref_seq_id_2,score_align_with_ref_seq_id_2))
-                if(score_align_with_ref_seq_id_2 >= len(dict_ref_seq[ref_seq_id_2])-DIST_MAX):
-                    print '------possible reclassif of pID '+pID+' in ref_seq_id_2 '+str(ref_seq_id_2)+', with align score: '+str(score_align_with_ref_seq_id_2)+' >= '+str(len(dict_ref_seq[ref_seq_id_2])-DIST_MAX)
-                    dict_count_reclassif_pIDs[ref_seq_id_2]+=1
-                    dict_count_reclassif_reads[ref_seq_id_2]+=sum([sum(map(int,np.array(dict_all_reads[pID])[:,i])) for i in [0,1]]) #dict_pIDs_occ[pID]
-                    dict_reclassif[pID].append((ref_seq_id_2,score_align_with_ref_seq_id_2))
-
-  
-
-    # 7.print contamination statistics
-    # 8.decontaminate the filtered reads file
-    dict_count_pIDs_to_bad_aligned_file=defaultdict(int)
-    dict_count_pIDs_from_contamination=defaultdict(int)
-    dict_count_pIDs_to_decontaminated_file=defaultdict(int)
-
-    with open(barcode_dir+'aligned_reads_'+readtype+'_decontaminated.fasta','w') as outfile_good, \
-            open(barcode_dir+'aligned_reads_'+readtype+'_bad_aligned.fasta','w') as outfile_bad:
-        sorted_pIDs = sorted(dict_all_reads.keys())
-        for pID in sorted_pIDs:
-            if pID in dict_reclassif:
-                read_count=0
-                for read in dict_all_reads[pID]:
-                    outfile_bad.write('>'+lt.read_label(pID, read[0], read[1])+'\n')
-                    outfile_bad.write(read[2]+'\n')
-                    read_count+=read[0] + read[1]
-                if len(dict_reclassif[pID])>0:
-                    dict_count_pIDs_from_contamination[pID] = read_count
-                else:
-                    dict_count_pIDs_to_bad_aligned_file[pID] = read_count
-            else:
-                read_count=0
-                for read in dict_all_reads[pID]:
-                    outfile_good.write('>'+lt.read_label(pID, read[0], read[1])+'\n')
-                    outfile_good.write(read[2]+'\n')                
-                    read_count+=read[0] + read[1]
-                dict_count_pIDs_to_decontaminated_file[pID]+=read_count
-        print '-- # good reads written: '+str(sum(dict_count_pIDs_to_decontaminated_file.values()))
-        print '-- # contaminant reads written: '+str(sum(dict_count_pIDs_from_contamination.values()))
-        print '-- # reads of unknown origin written: '+str(sum(dict_count_pIDs_to_bad_aligned_file.values()))
-
-    # write the consensus sequences file
-    with open(barcode_dir+'consensus_sequences_'+readtype+'_decontaminated.fasta','w') as outfile:
-        print 'write decontaminated file for run-barcode: '+str(true_seq_id)
-        consensus_pIDs = sorted(dict_cons_seq.keys())
-        for pID in consensus_pIDs:
-            if pID not in dict_reclassif:
-                rec = dict_cons_seq[pID][0]
-                outfile.write('>'+lt.read_label(pID, rec[0], rec[1])+'\n')
-                outfile.write(rec[2]+'\n')
-
-
-    # 9. print decontamination statistics
-    print '#############'
-    print '-RUN-barcode '+str(true_seq_id)
-    print '---> nb non contaminant pIDs: '+str(len(dict_count_pIDs_to_decontaminated_file.keys()))
-    print '---> nb non contaminant reads: '+str(sum(dict_count_pIDs_to_decontaminated_file.values()))
-    print '--'
-    print '---> nb contaminant pIDs: '+str(len(dict_count_pIDs_from_contamination.keys()))
-    print '---> nb contaminant reads: '+str(sum(dict_count_pIDs_from_contamination.values()))
-    print '--'
-    print '---> nb bad aligned but non contaminant pIDs : '+str(len(dict_count_pIDs_to_bad_aligned_file.keys()))
-    print '---> nb bad aligned but non contaminant reads: '+str(sum(dict_count_pIDs_to_bad_aligned_file.values()))
-
-
-else:
-    #print auto_file_name+': usage: '+auto_file_name+' <filtered reads file (in ../data/)> <consensus seq. file (in ../templates/<date-and-id>_consensus/)> <ref. seq. file (in ../data/)> <run_number>'
-    print auto_file_name+': usage: '+auto_file_name+' <barcode_dir> <ref_seq_file_name> read_type true_seq_id'
-
-
-    
diff --git a/src/p8_correct_all.py b/src/p8_correct_all.py
deleted file mode 100644 (file)
index 179c6b6..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/python
-
-import os
-import sys
-import time
-import glob 
-
-####
-# retrieve the temp reads file names to align in the specific cluster directory (in ../templates/) given in input
-# the given cluster directory path must be relative to the current directory (src)
-####
-if(len(sys.argv)==3):
-    run_dir = str(sys.argv[1])
-    run_dir = run_dir.rstrip('/')+'/'
-    read_type = sys.argv[2]
-
-    barcode_dir_list = glob.glob(run_dir+'bc_*_analysis')
-    for analysis_dir in barcode_dir_list:
-        #collect the list of temp reads files to align
-        cmd = 'qsub -cwd -l h_rt=00:59:00 -l h_vmem=10G ./src/p8_detect_mutants_indels.py '+analysis_dir+ ' ' +read_type
-        print cmd
-        os.system(cmd)
-
-else:
-    print sys.argv[0]+': usage: '+sys.argv[0]+' <directory of a run> <read_type>'
-
-
-
diff --git a/src/p8_detect_mutants_indels.py b/src/p8_detect_mutants_indels.py
deleted file mode 100755 (executable)
index c7346b0..0000000
+++ /dev/null
@@ -1,209 +0,0 @@
-#!/usr/bin/python
-
-####
-# Script that detects the possible mutants (pID mutations) with indels and generates a new filtered reads file (in ../data) resolving the pID mutations 
-# 
-# input: - filtered reads file (in ../data/)
-#        - consensus sequences file (in ../templates/dir-<date_and_id>_consensus/)
-# 
-# output: filtered reads file (in ../data/) resolving the pID mutations, new <date_and_id> = <date_and_id>-mutant-indels
-#         new read id: <pID-parent>_<nb-reads-fwd>_<nb-reads-rev>_<pID-origin>  
-####
-
-import numpy as np
-from Bio import SeqIO
-from Bio.Seq import Seq
-from Bio.Alphabet import generic_dna
-from collections import Counter
-from collections import defaultdict
-from Bio.pairwise2 import align
-from itertools import imap
-import operator
-import sys
-sys.path.append('./src')
-import datetime
-import time
-import lib_tools as lt
-
-auto_file_name = str(sys.argv[0])
-
-
-# distance max allowed for the consensus seq. alignments
-DIST_MAX = 3
-
-# minimum #times a pID must occur to have neighbors 
-MIN_OCC_MOST_ABUND_PID = 10
-
-max_pid_alignment_dist = 1.6
-# anything <1 does not allow any substitions
-# 1 allows only one point mutation
-# 1.6 allows one indel and one substitution
-# alignment parameters: match=1, unmatch=0, opening gap penalty=-0.6, gap extension penalty=-0.3
-
-
-########
-########
-
-
-
-if (len(sys.argv) == 3):
-    # parse the input data files names
-    barcode_dir = str(sys.argv[1]).rstrip('/')+'/'
-    readtype = sys.argv[2]
-      
-    # Create the dictionary of pIDs (from the output files of p1_trim_and_filter)
-
-    ###################################
-    # -- READ THE FILTERED READS FILE #
-    ###################################
-    # dictionary containing the list of reads (values) associated to a pID (keys)
-    dict_all_reads, count_reads_added_to_dict_all_reads, count_reads_with_invalid_pID = lt.parse_readfile(barcode_dir+'aligned_reads_'+readtype+'.fasta')
-    # control prints
-    ####
-    print 'count_reads_with_invalid_pID: '+str(count_reads_with_invalid_pID)
-    print 'count_reads_added_to_dict_all_reads: '+str(count_reads_added_to_dict_all_reads)
-    print '#pIDs different: '+str(len(dict_all_reads.keys()))
-
-
-    pID_abundance = {}
-
-    for pID,reads in dict_all_reads.iteritems():
-        pID_abundance[pID] = sum([x[0]+x[1] for x in reads])
-
-    print '#pIDs occurring only once: ' + str(sum([x==1 for x in pID_abundance.values()]))
-    print '#pIDs occurring only twice: ' + str(sum([x==2 for x in pID_abundance.values()]))
-    print '#pIDs occurring at least 3 times: ' + str(sum([x>2 for x in pID_abundance.values()]))
-    ####
-
-
-    ########################################
-    # -- READ THE CONSENSUS SEQUENCES FILE #
-    ########################################
-    # dictionary containing the consensus sequence (values) associated to a pID (keys) if it occurs at least 3 times
-    dict_cons_seq, a, b = lt.parse_readfile(barcode_dir+'consensus_sequences_'+readtype+'.fasta')
-    print 'From consensus seq. file :'
-    print '#pIDs occurring at least 3 times: ' + str(len(dict_cons_seq.keys()))
-
-
-    ###########################
-    # -- SEARCH THE NEIGHBORS #
-    ###########################
-    # dictionary indicating the "neighbor" state of each pID
-    dict_neighbor_state_pIDs = {}
-    for pID in dict_all_reads.keys():
-        dict_neighbor_state_pIDs[pID] = False
-
-    list_occ_pIDs_dec = sorted(pID_abundance.items(), key = lambda x:x[1], reverse=True)
-    list_occ_pIDs_inc = list_occ_pIDs_dec[::-1]
-
-    print 'Check list_occ_pIDs_dec and list_occ_pIDs_inc lengths (should be equal): '        
-    print len(list_occ_pIDs_dec), len(list_occ_pIDs_inc)
-    #print list_occ_pIDs_dec
-    #print list_occ_pIDs_inc
-    # dictionary containing the parent pID (values) of a given pID (keys)
-    dict_neighbor_of = defaultdict(list)
-    dict_neighbors = defaultdict(list)
-
-    #lim_pID_a = [list_occ_pIDs_inc[i][0]>=MIN_OCC_MOST_ABUND_PID for i in range(len(list_occ_pIDs_inc))].count(True)
-    lim_pID_a = [i[1]>=MIN_OCC_MOST_ABUND_PID for i in list_occ_pIDs_inc].count(True)
-    print 'lim_pID_a:'+str(lim_pID_a)
-    print 'nb occ pID_a at pos lim_pID_a-1 in list_occ_pIDs_dec: '+str(list_occ_pIDs_dec[lim_pID_a-1][1])
-    print 'nb occ pID_a at pos lim_pID_a in list_occ_pIDs_dec: '+str(list_occ_pIDs_dec[lim_pID_a][1])
-
-    for ii_a, pID_a in enumerate(list_occ_pIDs_dec[:lim_pID_a]):
-        # this condition should always evaluate to false
-        if dict_neighbor_state_pIDs[pID_a[0]] == True:
-            continue
-        # loop over rare pid
-        for ii,pID_r in enumerate(list_occ_pIDs_inc[:-(ii_a+1)]):
-            # check whether this PID is already a neighbor of some other
-             if((dict_neighbor_state_pIDs[pID_r[0]] == False) and (pID_r[1] < pID_a[1])):
-                pIDs_align_score = align.localms(pID_a[0], pID_r[0], 1, 0, -0.6, -0.3)
-                if (len(pIDs_align_score) != 0): # if pID_a[1] and pID_r[1] can be aligned
-                    if (pIDs_align_score[0][2] >= len(pID_a[0]) - max_pid_alignment_dist):
-                        print 'pID_a: '+str(pID_a)+', pID_r: '+str(pID_r)
-                        print pIDs_align_score[0][0], pIDs_align_score[0][1]
-                        print dict_neighbor_state_pIDs[pID_a[0]], dict_neighbor_state_pIDs[pID_r[0]]
-                        print "possible neighbors"
-
-                        if pID_r[1]>2: 
-                            neighbor_candidates = dict_cons_seq[pID_r[0]]
-                        else:
-                            neighbor_candidates = dict_all_reads[pID_r[0]]
-
-
-                        seq_a = dict_cons_seq[pID_a[0]][0][2] 
-                        #if (pID_r[0] != 2 or (pID_r[0] == 2 and len(dict_all_reads[pID_r[1]])==1)): # pID_r occurs once or more than twice, or twice with identical reads: only one sequence comparison to do
-                            #if(pID_r[0] <= 2): # pID_r occurs once or twice (with identical reads): its sequence is in dict_all_reads
-                                # seq_r = dict_all_reads[pID_r[0]][0][2]
-                                #seq_r = dict_all_reads[pID_r[1]][0][2] # [nb_fwd,nb_rev,seq]
-                            #else: # pID_r occurs at least 3 times: its consensus sequence is in dict_cons_seq
-                                #seq_r = dict_cons_seq[pID_r[1]][0][2] # [nb_fwd,nb_rev,seq]
-                        #else: # pID_r occurs twice with distinct reads: if one read aligns well: pID_r is considered as a mutant
-                            #[seq_r_0,seq_r_1] = [dict_all_reads[pID_r[1]][i][2] for i in [0,1]]
-                            
-                        #if(pID_)
-
-                        for nf, nr, seq_r in neighbor_candidates:
-                            if lt.check_neighbor_plausibility(seq_a, seq_r, DIST_MAX, verbose=False):
-                                dict_neighbors[pID_a[0]].append(pID_r[0])
-                                dict_neighbor_of[pID_r[0]] = pID_a[0]
-                                dict_neighbor_state_pIDs[pID_r[0]] = True
-                                print "NEIGHBORS !"
-                                break
-
-
-    print 'neighbors:'
-    nb_neighbors_stated = 0
-    for pID,state in dict_neighbor_state_pIDs.iteritems():
-        if state:
-            nb_neighbors_stated += 1
-            print pID
-
-
-
-    ##############################################
-    # -- WRITE THE NEIGHBORS FILTERED READS FILE #
-    ##############################################
-    count_reads_written = 0
-    count_neighbors_reads_written = 0
-
-    # write the new filtered reads file (mutant-indels):
-    corrected_aligned_reads_fname = barcode_dir+'corrected_reads.fasta'
-    with open(corrected_aligned_reads_fname, 'w') as neighbors_filtered_reads_file:
-        for pID in dict_all_reads.keys(): # for all pIDs                                
-            if not dict_neighbor_state_pIDs[pID]: # if pID "is not a neighbor"
-                print 'write reads for pID: ' + pID
-
-                # write the reads corresponding to pID
-                for cur_read in dict_all_reads[pID]: # cur_read is a list [nb_occ_fwd,nb_occ_rev,seq]
-                    #pid_orig = pID
-                    #mut_flag = ''
-                    new_read_id = str(pID+'_'+str(cur_read[0])+'_'+str(cur_read[1])+'_'+pID)
-                    neighbors_filtered_reads_file.write('>'+new_read_id+'\n') # write the new id of the read
-                    neighbors_filtered_reads_file.write(cur_read[2]+'\n') # write the sequence
-                    count_reads_written += (cur_read[0]+cur_read[1])
-
-                # write the neighbors if there are
-                if (len(dict_neighbors[pID]) > 0):
-                    print '#neighbors pIDs: ' + str(len(dict_neighbors[pID]))
-                    for pID_n in dict_neighbors[pID]: # for all the neighbors pID_n of pID
-
-                        # write the neighbors pID_n
-                        for read_n in dict_all_reads[pID_n]: # read_n is a list [nb_occ_fwd,nb_occ_rev,seq]
-                            pid_orig = pID_n
-                            mut_flag = '_m'
-                            new_read_id = str(pID+'_'+str(read_n[0])+'_'+str(read_n[1])+'_'+pid_orig+mut_flag)
-                            neighbors_filtered_reads_file.write('>'+new_read_id+'\n') # write the new id of the read
-                            neighbors_filtered_reads_file.write(read_n[2]+'\n') # write the sequence
-                            count_reads_written += (read_n[0]+read_n[1])
-                            count_neighbors_reads_written += (read_n[0]+read_n[1])
-
-
-    print 'nb reads written: '+str(count_reads_written)
-    print 'nb neighbored reads written: '+str(count_neighbors_reads_written)
-    print 'nb neighbored pIDs: '+str(nb_neighbors_stated)
-
-
-else:
-    print auto_file_name + ': usage: '+ auto_file_name + ' <directory with reads from a barcode> <type of read>'