+ moved qpalma dataset generation scripts
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 30 May 2008 09:08:13 +0000 (09:08 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 30 May 2008 09:08:13 +0000 (09:08 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9298 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/qpalma_dataset_generation/compile_dataset.py [new file with mode: 0644]
tools/qpalma_dataset_generation/createNewDataset.py [new file with mode: 0644]

diff --git a/tools/qpalma_dataset_generation/compile_dataset.py b/tools/qpalma_dataset_generation/compile_dataset.py
new file mode 100644 (file)
index 0000000..a125664
--- /dev/null
@@ -0,0 +1,435 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import random
+import os
+import re
+import pdb
+import cPickle
+
+import numpy
+from numpy.matlib import mat,zeros,ones,inf
+
+import qpalma
+import qpalma.tools
+
+from qpalma.parsers import *
+
+from Genefinding import *
+
+from genome_utils import load_genomic
+
+import qpalma.Configuration as Conf
+
+
+class DatasetGenerator:
+   """
+
+   """
+   
+   def __init__(self,filtered_reads,map_file,map_2nd_file):
+      assert os.path.exists(filtered_reads), 'Error: Cannot find reads file'
+      self.filtered_reads = filtered_reads
+
+      assert os.path.exists(map_file), 'Error: Cannot find map file'
+      assert os.path.exists(map_2nd_file), 'Error: Cannot find map_2nd file'
+      self.map_file = map_file
+      self.map_2nd_file = map_2nd_file
+
+      self.training_set = []
+      self.testing_set  = []
+
+      print 'parsing filtered reads...'
+      self.all_filtered_reads = parse_filtered_reads(self.filtered_reads)
+      print 'found %d filtered reads' % len(self.all_filtered_reads)
+
+
+   def saveAs(self,dataset_file):
+      assert not os.path.exists(dataset_file), 'The dataset_file already exists!'
+
+      # saving new datasets
+
+      #all_keys = self.training_set.keys()
+      #random.shuffle(all_keys)
+      #training_keys = all_keys[0:10000]
+      #cPickle.dump(self.training_set,open('%s.train.pickle'%dataset_file,'w+'),protocol=2)
+      #cPickle.dump(training_keys,open('%s.train_keys.pickle'%dataset_file,'w+'),protocol=2)
+
+      cPickle.dump(self.testing_set,open('%s.test.pickle'%dataset_file,'w+'),protocol=2)
+
+      # in Python 2, dict.keys() already returns the key list we need
+      prediction_keys = self.testing_set.keys()
+
+      cPickle.dump(prediction_keys,open('%s.test_keys.pickle'%dataset_file,'w+'),protocol=2)
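+
+      # sketch: a consumer restores the two pickles written above like so
+      # (assuming the same dataset_file prefix):
+      #
+      #    testing_set     = cPickle.load(open('%s.test.pickle' % dataset_file))
+      #    prediction_keys = cPickle.load(open('%s.test_keys.pickle' % dataset_file))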
+
+
+   def compile_training_set(self):
+      # this stores the new dataset
+      dataset = {}
+
+      # iterate over all remapped reads and generate a training /
+      # prediction example for each of them
+      instance_counter = 1
+      skipped_ctr = 0
+
+      for id,filteredRead in self.all_filtered_reads.items():
+         if instance_counter % 1001 == 0:
+            print 'processed %d examples' % instance_counter
+
+         # the training set consists only of spliced reads; these carry
+         # ids below 1000000300000
+         if id >= 1000000300000:
+            continue
+
+         if filteredRead['strand'] != '+':
+            skipped_ctr += 1
+            continue
+
+         # restrict to chromosomes 1-5
+         if filteredRead['chr'] not in range(1,6):
+            skipped_ctr += 1
+            continue
+
+         # we cut out the genomic region for training 
+         CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
+         genomicSeq_start  = filteredRead['p_start'] - CUT_OFFSET
+         genomicSeq_stop   = filteredRead['p_stop'] + CUT_OFFSET
+
+         # information of the full read we need for training
+         chromo         = filteredRead['chr']
+         strand         = filteredRead['strand']
+         original_est   = filteredRead['seq']
+         quality        = filteredRead['prb']
+         cal_prb        = filteredRead['cal_prb']
+         chastity       = filteredRead['chastity']
+
+         currentExons = zeros((2,2),dtype=numpy.int)
+         currentExons[0,0] = filteredRead['p_start']
+         currentExons[0,1] = filteredRead['exon_stop']
+         currentExons[1,0] = filteredRead['exon_start']
+         currentExons[1,1] = filteredRead['p_stop']
+
+         # add instance to set
+         currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
+         currentQualities = (quality,cal_prb,chastity)
+
+         dataset[id] = (currentSeqInfo,currentExons,original_est,currentQualities)
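+         # a training example therefore has the layout:
+         #   ((id,chromo,strand,genomicSeq_start,genomicSeq_stop),
+         #    2x2 exon boundary matrix, read sequence, (prb,cal_prb,chastity))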
+
+         instance_counter += 1
+
+      print 'Full dataset has size %d' % len(dataset)
+      print 'Skipped %d reads' % skipped_ctr
+
+      self.training_set = dataset
+      
+
+   def parse_map_file(self,dataset,map_file):
+      """
+      Parse a vmatch map file and add an entry for every match to the given
+      dataset dict; matches of the same read are collected under its id.
+      """
+      strand_map = ['-','+']
+      instance_counter = 1
+
+      for line in open(map_file):
+         if instance_counter % 1001 == 0:
+            print 'processed %d examples' % instance_counter
+
+         line  = line.strip()
+         slist = line.split()
+         id    = int(slist[0])
+         chromo   = int(slist[1])
+         pos      = int(slist[2])
+         strand   = slist[3]
+         # 'D' maps to '+', anything else (i.e. 'P') to '-' (cf. parseLine)
+         strand   = strand_map[strand == 'D']
+
+         if strand != '+':
+            continue
+
+         if chromo not in range(1,6):
+            continue
+
+         # cut out a 3000nt window centered on the match position
+         genomicSeq_start = pos - 1500
+         genomicSeq_stop  = pos + 1500
+
+         # fetch the missing information from the original reads
+         filteredRead = self.all_filtered_reads[id]
+         original_est = filteredRead['seq'].lower()
+         prb      = filteredRead['prb']
+         cal_prb  = filteredRead['cal_prb']
+         chastity = filteredRead['chastity']
+
+         # add instance to set
+         currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
+         currentQualities = (prb,cal_prb,chastity)
+
+         # a single read can have several vmatch matches; we store all of
+         # them under the read's unique id
+         if id in dataset:
+            dataset[id].append((currentSeqInfo,original_est,currentQualities))
+         else:
+            dataset[id] = [(currentSeqInfo,original_est,currentQualities)]
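+         # a testing entry thus maps the read id to a list of
+         # (currentSeqInfo, read sequence, qualities) tuples, one per match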
+
+         instance_counter += 1
+
+      # instance_counter started at 1, so subtract one for the true count
+      print 'Total examples processed: %d' % (instance_counter-1)
+
+      return dataset
+
+
+   def compile_testing_set(self):
+
+      dataset = {}
+
+      # usually we have two files to parse:
+      # the map file from the second run and a subset of the map file from the
+      # first run
+      dataset = self.parse_map_file(dataset,self.map_file)
+      dataset = self.parse_map_file(dataset,self.map_2nd_file)
+
+      # store the full set
+      self.testing_set = dataset
+
+
+def compile_dataset_direct(filtered_reads,dataset_file):
+   """
+   Build a dataset directly from a file of filtered (spliced) reads and
+   pickle it as three parallel lists: SeqInfo, OriginalEsts and Qualities.
+   """
+
+   strand_map = ['-','+']
+
+   SeqInfo = []
+   OriginalEsts = []
+   Qualities = []
+
+   instance_counter = 0
+
+   for line in open(filtered_reads):
+      line  = line.strip()
+      slist = line.split()
+      id    = int(slist[0])
+
+      # keep only spliced reads (ids below 1000000300000)
+      if id >= 1000000300000:
+         continue
+
+      if instance_counter % 1000 == 0:
+         print 'processed %d examples' % instance_counter
+
+      chr      = int(slist[1])
+      strand   = slist[2]
+      strand   = strand_map[strand == 'D']
+
+      # extend the genomic region by 1000nt on each side of the read
+      genomicSeq_start = int(slist[10]) - 1000
+      genomicSeq_stop  = int(slist[13]) + 1000
+
+      original_est = slist[3]
+      original_est = original_est.lower()
+
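+      # the per-base quality strings are ASCII-encoded; the fixed offsets
+      # below decode them into integer scores (same offsets as in parseLine)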
+      prb      = [ord(elem)-50 for elem in slist[6]]
+      cal_prb  = [ord(elem)-64 for elem in slist[7]]
+      chastity = [ord(elem)+10 for elem in slist[8]]
+
+      # add instance to set
+      SeqInfo.append((id,chr,strand,genomicSeq_start,genomicSeq_stop))
+      OriginalEsts.append(original_est)
+      Qualities.append( (prb,cal_prb,chastity) )
+
+      instance_counter += 1
+
+   dataset = [SeqInfo, OriginalEsts, Qualities ]
+
+   # saving new dataset
+   cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)
+
+
+
+
+def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
+   """
+   Now we want to use interval_query to get the predicted splice scores trained
+   on the TAIR sequence and annotation.
+   """
+
+   size = intervalEnd-intervalBegin
+   assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
+
+   # fetch acceptor scores
+   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
+   acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
+
+   # fetch donor scores
+   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
+   don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
+
+   return acc, don
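+
+# note: get_seq_and_scores below queries these score tracks with a 100nt
+# margin on each side of its target interval, e.g.
+#
+#    acc, don = getSpliceScores(chr, strand, genomicSeq_start-100, genomicSeq_stop+100)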
+
+
+def process_map(currentRRead,fRead):
+   """
+   For all matches found by Vmatch we calculate the fragment of the DNA we
+   would like to perform an aligment during prediction.
+   """
+
+   fragment_size = 3000
+
+   rId      = currentRRead['id']
+   pos      = currentRRead['pos']
+   chr      = currentRRead['chr']
+   strand   = currentRRead['strand']
+
+   length   = currentRRead['length']
+   offset   = currentRRead['offset']
+
+   CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
+
+   # vmatch recovered (approximately) the read's original position: same
+   # chromosome and strand, within 36nt of the original alignment
+   if fRead['chr'] == chr and fRead['strand'] == strand and fRead['p_start']-36 <= pos <= fRead['p_stop']+36:
+      genomicSeq_start = fRead['p_start'] - CUT_OFFSET
+      genomicSeq_stop = fRead['p_stop'] + CUT_OFFSET
+   else:
+      genomicSeq_start = pos - (fragment_size/2)
+      genomicSeq_stop = pos + (fragment_size/2)
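+      # e.g. a stray match at pos 10000 yields the window [8500,11500]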
+
+   return (rId,chr,strand,genomicSeq_start,genomicSeq_stop)
+
+
+def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
+   """
+   This function expects an interval, chromosome and strand information and
+   returns then the genomic sequence of this interval and the associated scores.
+   """
+
+   assert genomicSeq_start < genomicSeq_stop
+
+   chrom         = 'chr%d' % chr
+   genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
+   genomicSeq = genomicSeq.lower()
+
+   # check the obtained dna sequence
+   assert genomicSeq != '', 'load_genomic returned empty sequence!'
+   
+   # all entries other than a c g t are set to n
+   no_base = re.compile('[^acgt]')
+   genomicSeq = no_base.sub('n',genomicSeq)
+
+   # query the splice site scores with a 100nt safety margin on each side
+   intervalBegin  = genomicSeq_start-100
+   intervalEnd    = genomicSeq_stop+100
+   seq_pos_offset = genomicSeq_start
+
+   currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
+
+   # strip the margins again so that the score tracks line up with
+   # genomicSeq; the acceptor track is shifted by one additional position
+   currentAcc = currentAcc[100:-98]
+   currentAcc = currentAcc[1:]
+   currentDon = currentDon[100:-100]
+
+   # truncate the acceptor track to the sequence length and pad the donor
+   # track with -inf up to the same length
+   length = len(genomicSeq)
+   currentAcc = currentAcc[:length]
+   currentDon = currentDon+[-inf]*(length-len(currentDon))
+
+   # sanity check: acceptor scores must coincide with 'ag' dinucleotides and
+   # donor scores with 'gt'/'gc' dinucleotides; on failure pdb.set_trace()
+   # (evaluated lazily as the assert message) drops into the debugger
+   ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p > 1 and genomicSeq[p-1] == 'a' and e == 'g']
+   gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if 0 < p < len(genomicSeq)-1 and e == 'g' and genomicSeq[p+1] in 'tc']
+
+   assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
+   assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
+   assert len(currentAcc) == len(currentDon)
+
+   return genomicSeq, currentAcc, currentDon
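+
+# usage sketch (dna_flat_files must point at the genome directory; cf. the
+# A. thaliana path hard-coded in get_seq below):
+#
+#    dna, acc, don = get_seq_and_scores(1, '+', 1000, 2000, dna_flat_files)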
+
+
+def reverse_complement(seq):
+   # 'n' maps to itself so that sequences masked by get_seq_and_scores
+   # (which substitutes 'n' for unknown bases) are handled as well
+   complement = {'a':'t','c':'g','g':'c','t':'a','n':'n'}
+
+   new_seq = [complement[elem] for elem in seq]
+   new_seq.reverse()
+
+   return "".join(new_seq)
+
+
+def get_seq(begin,end,exon_end):
+   """
+   Load the genomic sequence for [begin,end] on chromosome 1 ('+' strand),
+   extended by two positions to cover the splice-site dinucleotide:
+   downstream if exon_end is set, upstream otherwise.
+   """
+
+   dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+
+   if exon_end:
+      gene_start = begin
+      gene_stop  = end+2
+   else:
+      gene_start = begin-2
+      gene_stop  = end
+
+   chrom         = 'chr1'
+   strand        = '+'
+
+   genomicSeq = load_genomic(chrom,strand,gene_start,gene_stop,dna_flat_files,one_based=False)
+   genomicSeq = genomicSeq.lower()
+
+   return genomicSeq
+
+
+def parseLine(line):
+   """
+   We assume that a line has the following entries:
+
+   read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut
+
+   """
+   id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()
+
+   splitpos = int(splitpos)
+   read_size = int(read_size)
+
+   seq=seq.lower()
+
+   assert strand in ['D','P']
+   strand = {'D':'+', 'P':'-'}[strand]
+
+   chr = int(chr)
+
+   prb = [ord(elem)-50 for elem in prb]
+   cal_prb = [ord(elem)-64 for elem in cal_prb]
+   chastity = [ord(elem)+10 for elem in chastity]
+
+   p_start = int(p_start)
+   exon_stop = int(exon_stop)
+   exon_start = int(exon_start)
+   p_stop = int(p_stop)
+   true_cut = int(true_cut)
+
+   line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
+   'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
+   'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
+   'p_stop':p_stop,'true_cut':true_cut}
+
+   return line_d
+
+
+if __name__ == '__main__':
+   # there is no standalone command line interface yet; this module is
+   # driven by createNewDataset.py
+   print 'usage: import DatasetGenerator from compile_dataset (see createNewDataset.py)'
+   sys.exit(1)
diff --git a/tools/qpalma_dataset_generation/createNewDataset.py b/tools/qpalma_dataset_generation/createNewDataset.py
new file mode 100644 (file)
index 0000000..fa64382
--- /dev/null
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import qpalma.Configuration as Conf
+from compile_dataset import DatasetGenerator
+
+filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/allReads.full'
+
+map_1_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/spliced.heuristic'
+map_2_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/map_2nd.vm'
+
+dg = DatasetGenerator(filtered_fn,map_1_fn,map_2_fn)
+#dg.compile_training_set()
+dg.compile_testing_set()
+dg.saveAs('dataset_19_05_08')
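+
+# saveAs writes two pickles into the working directory:
+#   dataset_19_05_08.test.pickle       (dict: read id -> list of matches)
+#   dataset_19_05_08.test_keys.pickle  (list of read ids used for prediction)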