+ added dataset generation class to the run-specific scripts dir.
author    fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 30 May 2008 09:25:43 +0000 (09:25 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 30 May 2008 09:25:43 +0000 (09:25 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9299 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/run_specific_scripts/transcriptome_analysis/compile_dataset.py [new file with mode: 0644]
tools/run_specific_scripts/transcriptome_analysis/createFullMap.sh
tools/run_specific_scripts/transcriptome_analysis/createNewDataset.py [new file with mode: 0644]

diff --git a/tools/run_specific_scripts/transcriptome_analysis/compile_dataset.py b/tools/run_specific_scripts/transcriptome_analysis/compile_dataset.py
new file mode 100644 (file)
index 0000000..d25c392
--- /dev/null
@@ -0,0 +1,276 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import random
+import os
+import re
+import pdb
+import cPickle
+
+import numpy
+from numpy.matlib import mat,zeros,ones,inf
+
+import qpalma
+import qpalma.tools
+
+from qpalma.parsers import *
+
+from Genefinding import *
+
+from genome_utils import load_genomic
+
+import qpalma.Configuration as Conf
+
+
+class DatasetGenerator:
+   """
+
+   """
+   
+   def __init__(self,map_file,map_2nd_file):
+      assert os.path.exists(map_file), 'Error: Can not find map file'
+      assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
+      self.map_file = map_file
+      self.map_2nd_file = map_2nd_file
+
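+      # NOTE: parse_map_file() looks up every read id in
+      # self.all_filtered_reads, which has to be populated with the
+      # original filtered reads (e.g. via one of the qpalma.parsers
+      # routines) before compile_dataset() is called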
+      self.dataset = {}
+
+
+   def saveAs(self,dataset_file):
+      dataset_fn      = '%s.pickle' % dataset_file
+      dataset_keys_fn = '%s.keys.pickle' % dataset_file
+
+      assert not os.path.exists(dataset_fn), 'The data_file already exists!'
+      assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'
+
+      # save the dataset and its keys (binary mode for pickle protocol 2)
+      cPickle.dump(self.dataset,open(dataset_fn,'wb'),protocol=2)
+      cPickle.dump(self.dataset.keys(),open(dataset_keys_fn,'wb'),protocol=2)
+
+
+   def parse_map_file(self,dataset,map_file):
+      strand_map = ['-','+']
+      instance_counter = 1
+
+      for line in open(map_file):
+         if instance_counter % 1001 == 0:
+            print 'processed %d examples' % instance_counter
+
+         line  = line.strip()
+         slist = line.split()
+         id    = int(slist[0])
+         chromo   = int(slist[1])
+         pos      = int(slist[2])
+         strand   = slist[3]
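+         # vmatch encodes the strand as 'D' (direct, i.e. '+') or 'P'
+         # (palindromic, i.e. reverse complement, '-'); the boolean
+         # comparison below selects index 0 or 1 of strand_map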
+         strand   = strand_map[strand == 'D']
+
+         if strand != '+':
+            continue
+
+         if chromo not in range(1,6):
+            continue
+
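+         # take a fixed window of +/- 1500nt around the match position
+         # as the genomic region belonging to this read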
+         genomicSeq_start = pos - 1500
+         genomicSeq_stop  = pos + 1500
+
+         # fetch missing information from original reads
+         filteredRead = self.all_filtered_reads[id]
+         original_est = filteredRead['seq']
+         original_est = original_est.lower()
+
+         prb     = filteredRead['prb']
+         cal_prb = filteredRead['cal_prb']
+         chastity = filteredRead['chastity']
+
+         # add instance to set
+         currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
+         currentQualities = (prb,cal_prb,chastity)
+
+         # as one single read can have several vmatch matches we store all
+         # these matches under the unique id of the read
+         dataset.setdefault(id,[]).append((currentSeqInfo,original_est,currentQualities))
+
+         instance_counter += 1
+
+      print 'Total examples processed: %d' % (instance_counter-1)
+
+      return dataset
+
+
+   def compile_dataset(self):
+      dataset = {}
+
+      # usually we have two files to parse:
+      # the map file from the second run and a subset of the map file from the
+      # first run
+      dataset = self.parse_map_file(dataset,self.map_file)
+      dataset = self.parse_map_file(dataset,self.map_2nd_file)
+
+      # store the full set; saveAs() serializes self.dataset
+      self.dataset = dataset
+
+
+
+
+def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
+   """
+   Use interval_query to fetch the splice site scores that were predicted
+   based on the TAIR sequence and annotation.
+   """
+
+   size = intervalEnd-intervalBegin
+   assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
+
+   # fetch acceptor scores
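+   # the '%d%s' placeholders are presumably filled in with the chromosome
+   # number and strand inside doIntervalQuery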
+   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
+   acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
+
+   # fetch donor scores
+   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
+   don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
+
+   return acc, don
+
+
+def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
+   """
+   This function expects an interval, chromosome and strand information and
+   returns the genomic sequence of this interval together with the
+   associated splice site scores.
+   """
+
+   assert genomicSeq_start < genomicSeq_stop
+
+   chrom         = 'chr%d' % chr
+   genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
+   genomicSeq = genomicSeq.lower()
+
+   # check the obtained dna sequence
+   assert genomicSeq != '', 'load_genomic returned empty sequence!'
+   
+   # all entries other than a c g t are set to n
+   no_base = re.compile('[^acgt]')
+   genomicSeq = no_base.sub('n',genomicSeq)
+
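+   # query a slightly larger interval (+/- 100nt); the flanks are trimmed
+   # off again further below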
+   intervalBegin  = genomicSeq_start-100
+   intervalEnd    = genomicSeq_stop+100
+   seq_pos_offset = genomicSeq_start
+
+   currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
+
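+   # trim the 100nt flanks and shift the acceptor scores by one position
+   # so that both score lists are aligned with genomicSeq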
+   currentAcc = currentAcc[100:-98]
+   currentAcc = currentAcc[1:]
+   currentDon = currentDon[100:-100]
+
+   length = len(genomicSeq)
+   currentAcc = currentAcc[:length]
+
+   currentDon = currentDon+[-inf]*(length-len(currentDon))
+
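+   # sanity check: acceptor scores may only appear directly after an 'ag'
+   # and donor scores at a 'gt' (or the rarer 'gc') dinucleotide; on
+   # assertion failure the pdb.set_trace() in the message drops into the
+   # debugger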
+   ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
+   gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+
+   assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
+   assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
+   assert len(currentAcc) == len(currentDon)
+
+   return genomicSeq, currentAcc, currentDon
+
+
+def reverse_complement(seq):
+   # do not shadow the builtin 'map'; 'n' maps to itself
+   complement = {'a':'t','c':'g','g':'c','t':'a','n':'n'}
+
+   new_seq = [complement[elem] for elem in seq]
+   new_seq.reverse()
+   new_seq = "".join(new_seq)
+
+   return new_seq
+
+
+def get_seq(begin,end,exon_end):
+   """
+   """
+
+   dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+
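+   # extend the interval by two nucleotides so that the 'gt' resp. 'ag'
+   # dinucleotide of the adjacent splice site is included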
+   if exon_end:
+      gene_start = begin
+      gene_stop  = end+2
+   else:
+      gene_start = begin-2
+      gene_stop  = end
+
+   chrom         = 'chr1'
+   strand        = '+'
+
+   genomicSeq = load_genomic(chrom,strand,gene_start,gene_stop,dna_flat_files,one_based=False)
+   genomicSeq = genomicSeq.lower()
+
+   return genomicSeq
+
+
+def parseLine(line):
+   """
+   We assume that a line has the following entries:
+   
+   read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut
+
+   """
+   # older map files lack the true_cut column; fall back to -1 in that case
+   try:
+      id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()
+   except ValueError:
+      id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
+      true_cut = -1
+
+   splitpos = int(splitpos)
+   read_size = int(read_size)
+
+   seq=seq.lower()
+
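+   # vmatch strand encoding: 'D' (direct) = '+', 'P' (palindromic) = '-'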
+   assert strand in ['D','P']
+
+   if strand == 'D':
+      strand = '+'
+
+   if strand == 'P':
+      strand = '-'
+
+   chr = int(chr)
+
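+   # the three quality strings are stored as ASCII-shifted integers; undo
+   # the per-field offsets used when the reads were written out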
+   prb = [ord(elem)-50 for elem in prb]
+   cal_prb = [ord(elem)-64 for elem in cal_prb]
+   chastity = [ord(elem)+10 for elem in chastity]
+
+   p_start = int(p_start)
+   exon_stop = int(exon_stop)
+   exon_start = int(exon_start)
+   p_stop = int(p_stop)
+   true_cut = int(true_cut)
+
+   line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
+   'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
+   'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
+   'p_stop':p_stop,'true_cut':true_cut}
+
+   return line_d
+
+
+if __name__ == '__main__':
+   usage = 'Usage: compile_dataset.py map_file map_2nd_file dataset_file'
+   assert len(sys.argv) == 4, usage
+
+   dg = DatasetGenerator(sys.argv[1],sys.argv[2])
+   # see the NOTE in DatasetGenerator.__init__: all_filtered_reads has to
+   # be attached before the dataset can be compiled
+   dg.compile_dataset()
+   dg.saveAs(sys.argv[3])
diff --git a/tools/run_specific_scripts/transcriptome_analysis/createFullMap.sh b/tools/run_specific_scripts/transcriptome_analysis/createFullMap.sh
index 439c343..06b3b92 100755 (executable)
@@ -12,24 +12,31 @@ function combine_and_check_maps {
    round_dir=$2
    # this is the result file ( map.vm or map_2nd.vm)
    new_map=$3
+   logfile=$4
+
+   echo "Starting logfile entries for $new_map" > $logfile
 
    current_map=$run_dir/1/length_38/${round_dir}/map.vm
    echo "processing $current_map ..."
    cat  $current_map > $new_map
    lane1_read_ctr=`cut -f1 $current_map | sort -u | wc -l`
+   echo "lane1 reads $lane1_read_ctr" >> $logfile
 
    current_map=$run_dir/2/length_38/${round_dir}/map.vm
    echo "processing $current_map ..."
    cat $current_map >> $new_map
    lane2_read_ctr=`cut -f1 $current_map | sort -u | wc -l`
+   echo "lane2 reads $lane2_read_ctr" >> $logfile
 
    current_map=$run_dir/3/length_38/${round_dir}/map.vm
    echo "processing $current_map ..."
    cat $current_map >> $new_map
    lane3_read_ctr=`cut -f1 $current_map | sort -u | wc -l`
+   echo "lane3 reads $lane3_read_ctr" >> $logfile
 
    combined_lanes_ctr=`echo "$lane1_read_ctr + $lane2_read_ctr + $lane3_read_ctr" | bc`
    combined_read_ctr=`cut -f1 $new_map | sort -u | wc -l`
+   echo "$new_map reads $combined_read_ctr" >> $logfile
 
    # here we check whether the number of reads of the combined file is the sum
    # of the reads of the single lane files
@@ -43,6 +50,7 @@ run_dir=/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_
 #mkdir /tmp/fabio
 map_1st=/tmp/fabio/map.vm
 map_2nd=/tmp/fabio/map_2nd.vm
+logfile=/tmp/fabio/logfile.txt
 
-combine_and_check_maps $run_dir "main" $map_1st
-combine_and_check_maps $run_dir "spliced" $map_2nd
+#combine_and_check_maps $run_dir "main" $map_1st $logfile
+#combine_and_check_maps $run_dir "spliced" $map_2nd $logfile
diff --git a/tools/run_specific_scripts/transcriptome_analysis/createNewDataset.py b/tools/run_specific_scripts/transcriptome_analysis/createNewDataset.py
new file mode 100644 (file)
index 0000000..06961a2
--- /dev/null
@@ -0,0 +1,12 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import qpalma.Configuration as Conf
+from compile_dataset import DatasetGenerator
+
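+# build the transcriptome dataset from a subset of the first run's map
+# file (the heuristically filtered spliced reads) and the map file of
+# the second run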
+map_1_fn = '/tmp/fabio/spliced.heuristic'
+map_2_fn = '/tmp/fabio/map_2nd.vm'
+
+dg = DatasetGenerator(map_1_fn,map_2_fn)
+dg.compile_dataset()
+dg.saveAs('/tmp/fabio/dataset_transcriptome_run_1')
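+
+# the pickled dataset can later be loaded again, e.g.:
+#
+#   import cPickle
+#   dataset = cPickle.load(open('/tmp/fabio/dataset_transcriptome_run_1.pickle','rb'))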