added run-specific config files, changed p1 to accommodate this
author Richard <richard.neher@tuebingen.mpg.de>
Wed, 25 Sep 2013 15:29:55 +0000 (17:29 +0200)
committer Richard <richard.neher@tuebingen.mpg.de>
Wed, 25 Sep 2013 15:29:55 +0000 (17:29 +0200)
.gitignore
configFile_454_run1.py
configFile_454_run2.py
src/p1_trim_and_filter.py
src/p2_sort.py
src/p2_sort_all.py
src/p3_cluster_align.py
src/p4_consensus.py
src/p7_decontamination.py
src/p8_detect_mutants_indels.py

index da34716..786d9af 100644 (file)
@@ -9,6 +9,7 @@ auto
 *fastq
 region_.tex
 *txt
+*.py.*
 
 ### /home/fabio/.gitignore-boilerplates/Global/vim.gitignore
 *tar.gz
index 3314898..54a2bd1 100644 (file)
@@ -6,12 +6,11 @@
 # 454 #
 #######
 cfg={
-    'runid':'454_subsample_run1',
+    'runid':'PID_454_run1',
     'p5_virus_match':'GTAGCATGACAAAAATCTTAGAGCC',
     'p3_virus_match':'CATTRCTTTGGATGGGTATGAA',
     'barcodes':['ACG','CAG','GTA', 'GTC'],
-#    'input_data_file':'../data/rawdata_reg2.fsa',
-    'input_data_file':'data/subsample.fasta',
+    'input_data_file':'/ebio/ag-neher/share/data/PID_454/rawdata_reg1.fsa',
     'p5_cutoff': 21,#=len(p5_virus_match)-4
     'p3_cutoff': 18,#=len(p3_virus_match)-4
     'min_occ_same_pid':1,
index 445422c..e99c354 100644 (file)
@@ -6,12 +6,12 @@
 # 454 #
 #######
 cfg={
-    'runid':'454_subsample_run2',
+    'runid':'PID_454_run2',
     'p5_virus_match':'GTAGCATGACAAAAATCTTAGAGCC',
     'p3_virus_match':'CATTRCTTTGGATGGGTATGAA',
     'barcodes':['ACG','CGT','TAC'],
 #    'input_data_file':'../data/rawdata_reg2.fsa',
-    'input_data_file':'data/subsample.fasta',
+    'input_data_file':'/ebio/ag-neher/share/data/PID_454/rawdata_reg2.fsa',
     'p5_cutoff': 21,#=len(p5_virus_match)-4
     'p3_cutoff': 18,#=len(p3_virus_match)-4
     'min_occ_same_pid':1,
index e944905..c317a23 100755 (executable)
@@ -141,6 +141,8 @@ def filter_reads(res):
     time_start = time.time()
     with open(str(res.input_data_file), 'r') as seq_file:
         file_format = res.input_data_file.split('.')[-1]
+        if file_format=='fsa':
+            file_format = 'fasta'
         print('opened file '+res.input_data_file+' '+file_format+'\n')
         for record in SeqIO.parse(seq_file, file_format):
             tmp_seq = str(record.seq)
index aed6b82..d478322 100755 (executable)
@@ -35,7 +35,7 @@ class struct_var_set:
         #self.templates={}
         self.barcode = 'NYD'
 
-batchsize = 10
+batchsize = 100
 ######
 # MAIN
 ######
index d0f8ae7..f79dcff 100644 (file)
@@ -2,10 +2,11 @@ import sys
 import glob
 import subprocess as sp
 
-if len(sys.argv)==2:
+if len(sys.argv)==3:
     rundir = sys.argv[1].rstrip('/')+'/'
-    
+    readtype = sys.argv[2]
+
     reads_by_barcode = glob.glob(rundir+'bc*.fasta')
-    for bc_file in reads_by_barcode:
-        sp.call(['python', 'src/p2_sort.py',bc_file])
+    for bc_dir in reads_by_barcode:
+        sp.call(['python', 'src/p2_sort.py',bc_file, readtype])
 
index 1d00530..c7febe8 100755 (executable)
@@ -22,7 +22,7 @@ if(len(sys.argv)==2):
         list_temp_dirs = glob.glob(analysis_dir+'/temp_*')
         #run one job per file to align on the cluster
         for temp_dir in list_temp_dirs:
-            cmd = 'qsub -cwd -l h_rt=12:00:00 -l h_vmem=10G ./src/p3_align.py '+temp_dir
+            cmd = 'qsub -cwd -l h_rt=00:59:00 -l h_vmem=10G ./src/p3_align.py '+temp_dir
             print cmd
             os.system(cmd)
 
index 86dcd01..d5668ab 100755 (executable)
@@ -71,7 +71,6 @@ if __name__=='__main__':
 
             with open(consensus_fname, 'w') as consensus_file, \
                     open(aligned_reads_fname, 'w') as aligned_reads_file:
-                print temp_directories
                 for temp_dir in temp_directories:
                     pID_files = glob.glob(temp_dir+'/*aligned.fasta')
                     for pID_file in pID_files:
index ee29548..5d4839d 100755 (executable)
@@ -36,7 +36,7 @@ if (len(sys.argv)==5):
     dict_ref_seq ={}
     with open(ref_seq_file_name, 'r') as infile:
         for seq in SeqIO.parse(infile, 'fasta'):
-            dict_ref_seq[seq.id]=str(seq.seq)
+            dict_ref_seq[seq.description]=str(seq.seq)
 
 
     #2.parse the consensus sequences files for the given prefix_date_and_id in dict_cons_seq
@@ -167,7 +167,6 @@ if (len(sys.argv)==5):
 
     # write the decontaminated aligned filtered reads file
     with open(barcode_dir+'aligned_reads_'+readtype+'_decontaminated.fasta','w') as outfile:
-    #with open(barcode_dir+readtype+'_decontaminated_reads.fasta','w') as outfile:
         print 'write decontaminated file for run-barcode: '+str(true_seq_id)
         for record in list_of_reads_to_write_in_decontaminated_file:
             outfile.write('>'+str(record.id)+'\n')
@@ -182,6 +181,14 @@ if (len(sys.argv)==5):
             outfile.write(str(record.seq)+'\n')
         print '-- #reads written: '+str(len(list_of_reads_to_write_in_bad_aligned_file))
 
+    # write the consensus sequences file
+    with open(barcode_dir+'consensus_sequences_'+readtype+'_decontaminated.fasta','w') as outfile:
+        print 'write decontaminated file for run-barcode: '+str(true_seq_id)
+        for pID,rec in dict_cons_seq.iteritems():
+            if pID not in dict_reclassif:
+                outfile.write('>'+lt.read_label(pID, rec[0][0], rec[0][1])+'\n')
+                outfile.write(rec[0][2]+'\n')
+
 
     # 9. print decontamination statistics
     print '#############'
index afe17c1..e262b27 100755 (executable)
@@ -168,7 +168,7 @@ if (len(sys.argv) == 3):
     count_neighbors_reads_written = 0
 
     # write the new filtered reads file (mutant-indels):
-    corrected_aligned_reads_fname = barcode_dir+'corrected_aligned_reads.fasta'
+    corrected_aligned_reads_fname = barcode_dir+'corrected_reads.fasta'
     with open(corrected_aligned_reads_fname, 'w') as neighbors_filtered_reads_file:
         for pID in dict_all_reads.keys(): # for all pIDs                                
             if not dict_neighbor_state_pIDs[pID]: # if pID "is not a neighbor"