+ modifications in order to use new data format / access methods
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 31 Jul 2008 09:50:53 +0000 (09:50 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 31 Jul 2008 09:50:53 +0000 (09:50 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@10268 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/run_specific_scripts/transcriptome_analysis/combine_spliced_map_parts.sh
tools/run_specific_scripts/transcriptome_analysis/compile_dataset.py
tools/run_specific_scripts/transcriptome_analysis/createNewDataset.py

index 89547df..415405d 100644 (file)
@@ -12,7 +12,7 @@ import qpalma
 import qpalma.tools
 
 from qpalma.parsers import *
-from qpalma.sequence_utils import *
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq
 
 import QPalmaConfiguration as Conf
 
@@ -78,12 +78,23 @@ class DatasetGenerator:
       strand_map = ['-','+']
       instance_counter = 0
 
+      g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+      g_fmt = 'chr%d.dna.flat'
+      s_fmt = 'contig_%d%s'
+
+      num_chromo = 6
+
+      accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+      seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+
       for line in open(map_file):
-         if instance_counter > 0 and instance_counter % 50000 == 0:
+         if instance_counter > 0 and instance_counter % 5000 == 0:
             print 'processed %d examples' % instance_counter
-
-         #if instance_counter == 10:
-         #   break
+            #if instance_counter == 10000:
+            #   break
 
          slist    = line.split()
 
@@ -95,23 +106,23 @@ class DatasetGenerator:
          strand   = slist[3]
          strand   = strand_map[strand == 'D']
 
-         if first_map:
-            read_seq = slist[7]
-         else:
-            read_seq = slist[11]
+         read_seq = slist[4]
 
          # QPalma uses lowercase characters
          read_seq = read_seq.lower()
+         read_seq = unbracket_seq(read_seq)
          
-         # fetch the three quality vectors
-         _prb      = slist[8]
-         _cal_prb  = slist[9]
-         _chastity = slist[10]
+         # fetch the prb quality vector
+         _prb      = slist[5]
 
          # for the time being we only consider chromosomes 1-5
          if not chromo in range(1,6):
             continue
 
+         if strand == '-':
+            pos = seqInfo.chromo_sizes[chromo]-pos-self.read_size
+            read_seq = reverse_complement(read_seq)
+
          # we use an area of +/- 1500 nucleotides around the seed position
          if pos > 1501:
             ds_offset = 1500
@@ -123,13 +134,10 @@ class DatasetGenerator:
          genomicSeq_start = pos - ds_offset
          genomicSeq_stop  = pos + us_offset
 
-         # fetch missing information from original reads
-         flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-
-         try:
-            dna_seq = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,flat_files,True)
-         except:
-            pdb.set_trace()
+         #try:
+         dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
+         #except:
+         #   pdb.set_trace()
 
          # Take into account that the results of the second run do not contain
          # the original reads anymore.
@@ -142,17 +150,17 @@ class DatasetGenerator:
          # qualities. Each quality element can range as follows: -128 <= elem <= 127
          prb      = array.array('b',[0]*self.read_size)
          #cal_prb  = array.array('b',[0]*self.read_size)
-         chastity = array.array('b',[0]*self.read_size)
+         #chastity = array.array('b',[0]*self.read_size)
 
          # convert the ascii-encoded quality scores
          for idx in range(self.read_size):
             prb[idx]       = ord(_prb[idx])-50
             #cal_prb[idx]   = ord(_cal_prb[idx])-64
-            chastity[idx]  = ord(_chastity[idx])+10
+            #chastity[idx]  = ord(_chastity[idx])+10
 
          # add instance to set
          currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
-         currentQualities = (prb,chastity)
+         currentQualities = (prb)
 
          # as one single read can have several vmatch matches we store all
          # these matches under the unique id of the read
index e32dde5..8dea2c5 100644 (file)
@@ -2,26 +2,26 @@
 # -*- coding: utf-8 -*-
 
 import os.path
+import cProfile
 
 from compile_dataset import DatasetGenerator
 
-jp = os.path.join
 
-#working_dir='/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
-#working_dir='/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
-#working_dir='/fml/ag-raetsch/share/projects/qpalma/solexa/new_run2/mapping/spliced'
+def run():
+   jp = os.path.join
 
-main_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_0/'
+   main_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
 
-#spliced_dir = '/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_44/4/length_38/spliced'
-#result_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/'
+   spliced_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1'
+   result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset'
 
-spliced_dir = '/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_44/4/length_38/spliced_3'
-result_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/'
+   map_1_fn = jp(main_dir,'map.vm.spliced')
+   map_2_fn = jp(spliced_dir,'map.vm')
 
-map_1_fn = jp(main_dir,'map.vm.spliced')
-map_2_fn = jp(spliced_dir,'map.vm')
+   dg = DatasetGenerator(map_1_fn,map_2_fn)
+   dg.compile_dataset()
+   dg.saveAs(jp(result_dir,'dataset_run_1.pickle'))
 
-dg = DatasetGenerator(map_1_fn,map_2_fn)
-dg.compile_dataset()
-dg.saveAs(jp(result_dir,'dataset_run_2.pickle'))
+if __name__ == '__main__':
+   #cProfile.run('run()')
+   run()