+ optimized dataset generation/storage
author     fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
           Fri, 20 Jun 2008 12:31:35 +0000 (12:31 +0000)
committer  fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
           Fri, 20 Jun 2008 12:31:35 +0000 (12:31 +0000)
+ added some assertions

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9694 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/run_specific_scripts/transcriptome_analysis/README
tools/run_specific_scripts/transcriptome_analysis/combine_spliced_map_parts.sh [new file with mode: 0755]
tools/run_specific_scripts/transcriptome_analysis/compile_dataset.py
tools/run_specific_scripts/transcriptome_analysis/createFullMap.sh
tools/run_specific_scripts/transcriptome_analysis/createNewDataset.py

diff --git a/tools/run_specific_scripts/transcriptome_analysis/README b/tools/run_specific_scripts/transcriptome_analysis/README
index 55d4976..0a20713 100644 (file)
@@ -1,5 +1,10 @@
-Steps
------
+----------------
+Processing Steps
+----------------
+
+
+Data Generation
+---------------
 
 The data was taken from
 
@@ -7,7 +12,50 @@ The data was taken from
 
 directories 1 to 3
 
-We combined the map.vm and map_2nd.vm for all these three dirs.
+We combined the map.vm and map_2nd.vm files of all three directories using 'createFullMap.sh'.
+
+The combined data was saved in the directory /fml/ag-raetsch/home/fabio/tmp/transcriptome_data.
+
+
+QPalma Heuristic
+----------------
+
+Next we split up map.vm in order to run the QPalma heuristic in parallel; the
+heuristic estimates which entries in map.vm should be aligned by QPalma (see
+the splitter sketch after this diff).
+
+We combined the results using 'combine_spliced_map_parts.sh'.
+
+
+QPalma Dataset Generation
+-------------------------
+
+Once we have the map_2nd.vm and map.vm.spliced files, we create a QPalma
+dataset using 'createNewDataset.py'.
+
+
+
+
+Coverage Number Estimation
+--------------------------
+
+For this purpose we need the raw FASTA files of the original reads. In the
+directory 'raw_reads' there should be a file 'reads_0.fa' containing all
+original reads in FASTA format.
+
+
+
+
+New Vmatch Criteria
+-------------------
+
+Old settings (1 mismatch):
+
+/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_44/4/length_38/spliced
+
+
 
-fabio@congo:/fml/ag-raetsch/share/projects/qpalma/solexa/scripte/first_transcriptome_run$ 
+New settings (2 mismatches, stepwise trimming by -3 from 35 to 20):
+/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_44/4/length_38/spliced_3
 
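The 'QPalma Heuristic' step above splits map.vm into ten numbered parts; the
part names here are taken from the loop in combine_spliced_map_parts.sh below,
while the round-robin distribution is an assumption. A minimal splitter sketch:

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # Sketch: split map.vm into ten parts for the parallel heuristic run.
    # The part names mirror combine_spliced_map_parts.sh; distributing the
    # map lines round-robin is an assumption, any even split would do.

    num_parts = 10
    parts = [open('map.vm.part_%d' % idx, 'w') for idx in range(num_parts)]

    for line_idx, line in enumerate(open('map.vm')):
        parts[line_idx % num_parts].write(line)

    for part in parts:
        part.close()
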
diff --git a/tools/run_specific_scripts/transcriptome_analysis/combine_spliced_map_parts.sh b/tools/run_specific_scripts/transcriptome_analysis/combine_spliced_map_parts.sh
new file mode 100755 (executable)
index 0000000..5959484
--- /dev/null
@@ -0,0 +1,12 @@
+#!/bin/bash
+
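+# start from an empty combined file (touch first so rm cannot fail)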
+touch map.vm.spliced
+rm map.vm.spliced
+
+for((idx=0;idx<10;idx++))
+do
+   cat map.vm.part_${idx}.heuristic.spliced >> map.vm.spliced
+done
+
+
+
diff --git a/tools/run_specific_scripts/transcriptome_analysis/compile_dataset.py b/tools/run_specific_scripts/transcriptome_analysis/compile_dataset.py
index 600e420..17afe77 100644 (file)
@@ -2,45 +2,29 @@
 # -*- coding: utf-8 -*-
 
 import sys
+import array
 import random
 import os
 import re
 import pdb
 import cPickle
 
-#import numpy
-#from numpy.matlib import mat,zeros,ones,inf
-
 import qpalma
 import qpalma.tools
 
 from qpalma.parsers import *
 from qpalma.sequence_utils import *
 
-#from Genefinding import *
-#from genome_utils import load_genomic
-
 import qpalma.Configuration as Conf
 
-def create_bracket_seq(dna_seq,read_seq):
-   """
-   This function takes a dna sequence and a read sequence and returns the
-   bracket format of the match/mismatches i.e.
-
-   dna : aaa
-   read: aac
-   is written in bracket notation: aa[ac]
-   """
-   assert len(dna_seq) == len(read_seq)
-   return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
-      
+#
+# This script takes the map.vm and map_2nd.vm files as input and generates
+# QPalma dataset pickle files.
+#
 
 
 class DatasetGenerator:
-   """
 
-   """
-   
    def __init__(self,map_file,map_2nd_file):
       assert os.path.exists(map_file), 'Error: Can not find map file'
       assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
@@ -49,6 +33,8 @@ class DatasetGenerator:
 
       self.dataset = []
 
+      self.read_size = 38
+
 
    def saveAs(self,dataset_file):
       dataset_fn = '%s.pickle'% dataset_file
@@ -63,18 +49,23 @@ class DatasetGenerator:
 
 
    def parse_map_file(self,dataset,map_file,first_map):
+      """
+      """
+
       strand_map = ['-','+']
-      instance_counter = 1
+      instance_counter = 0
 
       for line in open(map_file):
-         if instance_counter % 1001 == 0:
+         if instance_counter > 0 and instance_counter % 50000 == 0:
             print 'processed %d examples' % instance_counter
 
-         line  = line.strip()
-         slist = line.split()
-         id    = int(slist[0])
+         slist    = line.split()
+
+         id       = int(slist[0])
          chromo   = int(slist[1])
          pos      = int(slist[2])
+
+         # convert D/P strand info to +/-
          strand   = slist[3]
          strand   = strand_map[strand == 'D']
 
@@ -83,50 +74,68 @@ class DatasetGenerator:
          else:
             read_seq = slist[11]
 
+         # QPalma uses lowercase characters
+         read_seq = read_seq.lower()
+         
+         # fetch the three quality vectors
          _prb      = slist[8]
          _cal_prb  = slist[9]
          _chastity = slist[10]
 
-         if strand != '+':
-            continue
-
+         # for the time being we only consider chromosomes 1-5
          if not chromo in range(1,6):
             continue
 
-         genomicSeq_start = pos - 1500
-         genomicSeq_stop  = pos + 1500
+         # we use an area of +/- 1500 nucleotides around the seed position;
+         # the downstream offset is clamped at the chromosome start
+         if pos > 1501:
+            ds_offset = 1500
+         else:
+            ds_offset = pos-1
+
+         us_offset = 1500
+            
+         genomicSeq_start = pos - ds_offset
+         genomicSeq_stop  = pos + us_offset
 
          # fetch missing information from original reads
          flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-         original_info = get_seq_and_scores(chromo,strand,pos,pos+35,flat_files)
-         dna_seq = original_info[0]
 
-         #filteredRead = self.all_filtered_reads[id]
-         #original_est = filteredRead['seq']
-         original_est = create_bracket_seq(dna_seq,read_seq)
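+         # debugging aid: drop into pdb if the sequence lookup fails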
+         try:
+            dna_seq = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,flat_files,True)
+         except:
+            pdb.set_trace()
+
+         # Take into account that the results of the second run do not contain
+         # the original reads anymore.
+         if first_map:
+            original_est = read_seq
+         else:
+            original_est = create_bracket_seq(dna_seq[ds_offset:ds_offset+self.read_size],read_seq)
 
-      for idx in range(read_size):
-         prb[idx]       = ord(_prb[idx])-50
-         cal_prb[idx]   = ord(_cal_prb[idx])-64
-         chastity[idx]  = ord(_chastity[idx])+10
+         # in order to save some space we use a signed char to store the
+         # qualities. Each quality element can range as follows: -128 <= elem <= 127
+         prb      = array.array('b',[0]*self.read_size)
+         #cal_prb  = array.array('b',[0]*self.read_size)
+         chastity = array.array('b',[0]*self.read_size)
+
+         # convert the ascii-encoded quality scores
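+         # (worked example: a quality character 'h' has ord('h') == 104 and
+         # decodes to the quality value 104-50 = 54)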
+         for idx in range(self.read_size):
+            prb[idx]       = ord(_prb[idx])-50
+            #cal_prb[idx]   = ord(_cal_prb[idx])-64
+            chastity[idx]  = ord(_chastity[idx])+10
 
          # add instance to set
          currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
-         currentQualities = (prb,cal_prb,chastity)
+         currentQualities = (prb,chastity)
 
          # as one single read can have several vmatch matches we store all
          # these matches under the unique id of the read
-         if dataset.has_key(id):
-            old_entry = dataset[id]
-            old_entry.append((currentSeqInfo,original_est,currentQualities))
-            dataset[id] = old_entry
-         else:
-            dataset[id] = [(currentSeqInfo,original_est,currentQualities)]
+         dataset.setdefault(id, []).append((currentSeqInfo,original_est,currentQualities))
 
          instance_counter += 1
 
-      print 'Total examples processed: %d' % instance_counter
 
+      print 'Total examples processed: %d' % instance_counter
       return dataset
 
 
@@ -139,5 +148,4 @@ class DatasetGenerator:
       dataset = self.parse_map_file(dataset,self.map_file,True)
       dataset = self.parse_map_file(dataset,self.map_2nd_file,False)
 
-      # store the full set
-      self.testing_set = dataset
+      self.dataset = dataset
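
Note that the rewritten parse_map_file still calls create_bracket_seq although
the local definition was removed above; presumably the helper now comes in via
the wildcard import from qpalma.sequence_utils. For reference, the removed
definition with a small usage example:

    def create_bracket_seq(dna_seq, read_seq):
        """
        Take a dna sequence and a read sequence of equal length and return
        the bracket notation of the match/mismatches, i.e. dna 'aaa' and
        read 'aac' yield 'aa[ac]'.
        """
        assert len(dna_seq) == len(read_seq)
        return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))

    # usage: create_bracket_seq('aaa', 'aac') returns 'aa[ac]'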
diff --git a/tools/run_specific_scripts/transcriptome_analysis/createFullMap.sh b/tools/run_specific_scripts/transcriptome_analysis/createFullMap.sh
index 06b3b92..af45329 100755 (executable)
@@ -14,7 +14,7 @@ function combine_and_check_maps {
    new_map=$3
    logfile=$4
 
-   echo "Starting logfile entries for $new_map" > $logfile
+   echo "Starting logfile entries for $new_map" >> $logfile
 
    current_map=$run_dir/1/length_38/${round_dir}/map.vm
    echo "processing $current_map ..."
@@ -48,9 +48,11 @@ function combine_and_check_maps {
 run_dir=/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_40
 
 #mkdir /tmp/fabio
-map_1st=/tmp/fabio/map.vm
-map_2nd=/tmp/fabio/map_2nd.vm
-logfile=/tmp/fabio/logfile.txt
+map_1st=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map.vm
+map_2nd=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map_2nd.vm
+logfile=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/logfile.txt
 
-#combine_and_check_maps $run_dir "main" $map_1st $logfile
-#combine_and_check_maps $run_dir "spliced" $map_2nd $logfile
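+# reset the logfile; touch first so rm cannot fail if it does not exist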
+touch $logfile && rm $logfile
+
+combine_and_check_maps $run_dir "main" $map_1st $logfile
+combine_and_check_maps $run_dir "spliced" $map_2nd $logfile
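
The combine_and_check_maps function is only partially visible in this diff;
from the visible fragment and the README it concatenates the per-directory map
files. A hypothetical sketch of that behaviour (the path layout and the
'main'/'spliced' round names are taken from the script, everything else is an
assumption):

    #!/usr/bin/env python
    # Hypothetical sketch of combine_and_check_maps: concatenate the map.vm
    # files of run directories 1 to 3 into one combined map file.
    import shutil

    def combine_maps(run_dir, round_dir, new_map):
        out = open(new_map, 'wb')
        for subdir in ('1', '2', '3'):
            current_map = '%s/%s/length_38/%s/map.vm' % (run_dir, subdir, round_dir)
            print 'processing %s ...' % current_map
            shutil.copyfileobj(open(current_map, 'rb'), out)
        out.close()

    combine_maps('/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_40',
                 'main', '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map.vm')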
diff --git a/tools/run_specific_scripts/transcriptome_analysis/createNewDataset.py b/tools/run_specific_scripts/transcriptome_analysis/createNewDataset.py
index ca1b7c7..89fd86c 100644 (file)
@@ -1,13 +1,19 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+import os.path
+
 import qpalma.Configuration as Conf
 from compile_dataset import DatasetGenerator
 
-#map_1_fn = '/tmp/fabio/spliced.heuristic'
-map_1_fn = '/tmp/fabio/map_2nd.vm'
-map_2_fn = '/tmp/fabio/map_2nd.vm'
+jp = os.path.join
+
+working_dir='/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
+result_dir='/fml/ag-raetsch/home/fabio/tmp/sandbox'
+
+map_1_fn = jp(working_dir,'map.vm.spliced')
+map_2_fn = jp(working_dir,'map_2nd.vm')
 
 dg = DatasetGenerator(map_1_fn,map_2_fn)
 dg.compile_dataset()
-dg.saveAs('/tmp/fabio/dataset_transcriptome_run_1')
+dg.saveAs(jp(result_dir,'dataset_transcriptome_run_1'))
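
A hypothetical consumer of the generated pickle file; the dict layout (read id
mapped to a list of vmatch matches) is inferred from parse_map_file above, and
the exact contents written by saveAs are not fully shown in this diff, so the
structure should be treated as an assumption:

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # Sketch: load the generated dataset and inspect the matches of one read.
    import cPickle

    pickle_fn = '/fml/ag-raetsch/home/fabio/tmp/sandbox/dataset_transcriptome_run_1.pickle'
    dataset = cPickle.load(open(pickle_fn))

    for read_id, matches in dataset.iteritems():
        # one read can have several vmatch matches stored under its id
        for seq_info, original_est, qualities in matches:
            _id, chromo, strand, genomicSeq_start, genomicSeq_stop = seq_info
            prb, chastity = qualities  # two array.array('b') quality vectors
            print read_id, chromo, strand, genomicSeq_start, genomicSeq_stop
        break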