extended read me file public
author Richard <richard.neher@tuebingen.mpg.de>
Thu, 26 Sep 2013 21:03:47 +0000 (23:03 +0200)
committer Richard <richard.neher@tuebingen.mpg.de>
Thu, 26 Sep 2013 21:03:47 +0000 (23:03 +0200)
README.txt
src/p3_cluster_bis_clean_logs_and_move_back_aligned_files.py [deleted file]
src/p4_consensus.py

index fa61c25..c8926a2 100644 (file)
@@ -18,5 +18,26 @@ COMMAND: python src/p3_cluster_align.py run_directory
 
 Starts a cluster job for each of the temp directories in each of the barcode directories inside the run_directory. 
 
+## step 4:
 
+COMMAND: python src/p4_consensus.py run_directory read_type
 
+This script goes over all barcodes in the run directory, gathers the aligned read files in the temporary directory of the desired read type, and builds consensus sequences. It also writes all aligned reads into a single file. 
+
+## step 5:
+
+COMMAND: python src/p5_decontamination.py bar_code_directory ref_seqs read_type true_seq_id
+
+This script takes the aligned reads from one barcode and checks whether the individual reads or the consensus sequence align reasonably well to the reference sequence with the true_seq_id. If a read does not, it is checked against all other reference sequences. All reads that don't align well to their own reference sequence are written into an extra file. 
+
+Alternatively, to submit batch jobs to the cluster:
+
+COMMAND: python src/p5_decontamination.py run_directory ref_seqs read_type true_seq_id
+
+## step 6:
+
+COMMAND: python src/p6_detect_mutants_indels.py barcode_dir read_type
+
+Checks whether the PIDs of low-abundance reads are less than a certain edit distance from a high-abundance one. A PID is designated a neighbor if, in addition, the reads align well. Produces read files with likely_pIDs and original PIDs. 
+
+After this step, the sorting, alignment and consensus steps (2-4) need to be redone with read_type "corrected" instead of "filtered". 
diff --git a/src/p3_cluster_bis_clean_logs_and_move_back_aligned_files.py b/src/p3_cluster_bis_clean_logs_and_move_back_aligned_files.py
deleted file mode 100755 (executable)
index 6d97210..0000000
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/usr/bin/python
-
-# /!\: script that must be run ONLY after the end of the cluster jobs
-# - script that cleans the cluster logs in src directory 
-#   and moves the temp and aligned reads files from the cluster specific directory ( in ../templates) given in input
-#   to their corresponding temp and align directory in ../templates (barcode specific)
-
-import os
-import sys
-import time
-import glob
-from collections import Counter
-from collections import defaultdict
-import lib_tools as lt
-
-auto_file_name = str(sys.argv[0])
-
-        
-######
-
-if __name__=='__main__':
-
-    if (len(sys.argv)==2):
-
-        # 1. clean the cluster logs in src directory (normally the current directory)
-        os.system('rm -f p3_cluster_align_aux.py.e* p3_cluster_align_aux.py.o*')
-        print 'cluster logs p3_cluster_align_aux_py.e* and p3_cluster_align_aux.py.o* deleted'
-
-        # 2. move the temp and aligned reads files from the cluster directory to their specific temp/align directory
-        relative_path_to_cluster_dir = str(sys.argv[1])
-        if relative_path_to_cluster_dir[-1]!='/':
-            relative_path_to_cluster_dir+='/'
-
-        path_to_templates = "../templates/"
-        prefix_date_and_id=relative_path_to_cluster_dir.split('/')[2].split('cluster-')[1]
-        print prefix_date_and_id
-        
-        #list_files = os.popen('ls '+relative_path_to_cluster_dir+'*').readlines()
-        list_files = glob.glob(relative_path_to_cluster_dir+'*') 
-        total_nb_files_in_cluster_dir = len(list_files)
-        print total_nb_files_in_cluster_dir
-
-        count_files = Counter()
-        dict_files = defaultdict(list)
-        
-        for cur_file in list_files:
-            cur_file = cur_file.strip()
-            cur_file_base_name = cur_file.split('/')[-1]
-            cur_file_type, cur_file_bc = [cur_file_base_name.split('_')[i] for i in [1,2]]
-            count_files[cur_file_type]+=1
-            dict_files[(cur_file_type,cur_file_bc)].append(cur_file_base_name)
-
-        print '#files in cluster directory: '+str(count_files)
-        # print dict_files
-    
-        # move the files to their directory
-        for type_and_bc in dict_files.keys():
-            # create the aligned files directory if necessary
-            new_file_directory = str(path_to_templates+'dir-'+prefix_date_and_id+'_'+type_and_bc[0]+'_'+type_and_bc[1]+'/')
-            lt.check_and_create_directory(new_file_directory)
-            for cur_file in dict_files[type_and_bc]:            
-                print 'move file '+relative_path_to_cluster_dir+cur_file+' to the directory: '+new_file_directory
-                os.system('mv '+relative_path_to_cluster_dir+cur_file+' '+new_file_directory)
-                count_files[type_and_bc[0]]-=1
-
-        print '#remaining files in cluster directory: '+str(count_files)
-    
-    else:
-        print auto_file_name+': usage: '+auto_file_name+' <cluster directory (in ../templates/)>'
index afc6da6..7521b93 100755 (executable)
@@ -9,6 +9,7 @@ import numpy as np
 from Bio import AlignIO
 from Bio import SeqIO
 from Bio.SeqRecord import SeqRecord
+from Bio.Align.Applications import MuscleCommandline
 import os
 import shutil
 import sys
@@ -91,10 +92,10 @@ if __name__=='__main__':
             time_start = time.time()
             aligned_fname = lt.trim_extension(consensus_fname)+'_aligned.fasta'
             try:
-                cline = MuscleCommandline(input = fname, out = aligned_fname)
+                cline = MuscleCommandline(input = consensus_fname, out = aligned_fname)
                 cline()
             except:
-                print "Trouble aligning", fname
+                print "Trouble aligning", consensus_fname
             
             # read all aligned sequences back in, sort them and write to file again
             consensus_seqs, counts_good, counts_bad = lt.parse_readfile(aligned_fname)