+ removed files
author	fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 30 May 2008 09:53:36 +0000 (09:53 +0000)
committer	fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 30 May 2008 09:53:36 +0000 (09:53 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9302 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/compile_dataset.py [deleted file]
scripts/createNewDataset.py [deleted file]

diff --git a/scripts/compile_dataset.py b/scripts/compile_dataset.py
deleted file mode 100644 (file)
index a125664..0000000
+++ /dev/null
@@ -1,435 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import sys
-import random
-import os
-import re
-import pdb
-import cPickle
-
-import numpy
-from numpy.matlib import mat,zeros,ones,inf
-
-import qpalma
-import qpalma.tools
-
-from qpalma.parsers import *
-
-from Genefinding import *
-
-from genome_utils import load_genomic
-
-import qpalma.Configuration as Conf
-
-
-class DatasetGenerator:
-   """
-
-   """
-   
-   def __init__(self,filtered_reads,map_file,map_2nd_file):
-      assert os.path.exists(filtered_reads), 'Error: Can not find reads file'
-      self.filtered_reads = filtered_reads
-
-      assert os.path.exists(map_file), 'Error: Can not find map file'
-      assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
-      self.map_file = map_file
-      self.map_2nd_file = map_2nd_file
-
-      self.training_set = []
-      self.testing_set  = []
-
-      print 'parsing filtered reads..'
-      self.all_filtered_reads = parse_filtered_reads(self.filtered_reads)
-      print 'found %d filtered reads' % len(self.all_filtered_reads)
-
-
-   def saveAs(self,dataset_file):
-      assert not os.path.exists(dataset_file), 'The dataset_file already exists!'
-
-      # saving new datasets
-
-      #all_keys = self.training_set.keys()
-      #random.shuffle(all_keys)
-      #training_keys = all_keys[0:10000]
-      #cPickle.dump(self.training_set,open('%s.train.pickle'%dataset_file,'w+'),protocol=2)
-      #cPickle.dump(training_keys,open('%s.train_keys.pickle'%dataset_file,'w+'),protocol=2)
-
-      cPickle.dump(self.testing_set,open('%s.test.pickle'%dataset_file,'w+'),protocol=2)
-
-      prediction_keys = [0]*len(self.testing_set.keys())
-      for pos,key in enumerate(self.testing_set.keys()):
-         prediction_keys[pos] = key
-
-      cPickle.dump(prediction_keys,open('%s.test_keys.pickle'%dataset_file,'w+'),protocol=2)
-
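-   # Illustrative sketch (not part of the original code): the pickles written
-   # by saveAs() would be read back with matching cPickle calls, e.g.
-   #
-   #    testing_set     = cPickle.load(open('%s.test.pickle' % dataset_file))
-   #    prediction_keys = cPickle.load(open('%s.test_keys.pickle' % dataset_file))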
-
-   def compile_training_set(self):
-      # this stores the new dataset
-      dataset = {}
-
-      # Iterate over all remapped reads in order to generate for each read a
-      # training / prediction example
-      instance_counter = 1
-      skipped_ctr = 0
-
-      for id,filteredRead in self.all_filtered_reads.items():
-         if instance_counter % 1001 == 0:
-            print 'processed %d examples' % instance_counter
-
-         # training set consists only of spliced reads
-         if not id < 1000000300000:
-            continue 
-
-         if filteredRead['strand'] != '+':
-            skipped_ctr += 1
-            continue
-
-         if not filteredRead['chr'] in range(1,6):
-            skipped_ctr += 1
-            continue
-
-         # we cut out the genomic region for training 
-         CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
-         genomicSeq_start  = filteredRead['p_start'] - CUT_OFFSET
-         genomicSeq_stop   = filteredRead['p_stop'] + CUT_OFFSET
-
-         # information of the full read we need for training
-         chromo         = filteredRead['chr']
-         strand         = filteredRead['strand']
-         original_est   = filteredRead['seq']
-         quality        = filteredRead['prb']
-         cal_prb        = filteredRead['cal_prb']
-         chastity       = filteredRead['chastity']
-
-         currentExons = zeros((2,2),dtype=numpy.int)
-         currentExons[0,0] = filteredRead['p_start']
-         currentExons[0,1] = filteredRead['exon_stop']
-         currentExons[1,0] = filteredRead['exon_start']
-         currentExons[1,1] = filteredRead['p_stop']
-
-         # add instance to set
-         currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
-         currentQualities = (quality,cal_prb,chastity)
-
-         dataset[id] = (currentSeqInfo,currentExons,original_est,currentQualities)
-
-         instance_counter += 1
-
-      print 'Full dataset has size %d' % len(dataset)
-      print 'Skipped %d reads' % skipped_ctr
-
-      self.training_set = dataset
-      
-
-   def parse_map_file(self,dataset,map_file):
-      strand_map = ['-','+']
-      instance_counter = 1
-
-      for line in open(map_file):
-         if instance_counter % 1001 == 0:
-            print 'processed %d examples' % instance_counter
-
-         line  = line.strip()
-         slist = line.split()
-         id    = int(slist[0])
-         chromo   = int(slist[1])
-         pos      = int(slist[2])
-         strand   = slist[3]
-         strand   = strand_map[strand == 'D']
-
-         if strand != '+':
-            continue
-
-         if not chromo in range(1,6):
-            continue
-
-         genomicSeq_start = pos - 1500
-         genomicSeq_stop  = pos + 1500
-
-         # fetch the missing information from the original reads
-         filteredRead = self.all_filtered_reads[id]
-         original_est = filteredRead['seq'].lower()
-
-         prb      = filteredRead['prb']
-         cal_prb  = filteredRead['cal_prb']
-         chastity = filteredRead['chastity']
-
-         # add instance to set
-         currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
-         currentQualities = (prb,cal_prb,chastity)
-
-         # as one single read can have several vmatch matches we store all
-         # these matches under the unique id of the read
-         if dataset.has_key(id):
-            old_entry = dataset[id]
-            old_entry.append((currentSeqInfo,original_est,currentQualities))
-            dataset[id] = old_entry
-         else:
-            dataset[id] = [(currentSeqInfo,original_est,currentQualities)]
-
-         instance_counter += 1
-
-      print 'Total examples processed: %d' % instance_counter
-
-      return dataset
-
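-   # Equivalent sketch (an assumption, not original code) of the per-read
-   # accumulation above using collections.defaultdict, which would avoid the
-   # has_key branch:
-   #
-   #    from collections import defaultdict
-   #    dataset = defaultdict(list)
-   #    ...
-   #    dataset[id].append((currentSeqInfo,original_est,currentQualities))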
-
-   def compile_testing_set(self):
-
-      dataset = {}
-
-      # usually we have two files to parse:
-      # the map file from the second run and a subset of the map file from the
-      # first run
-      dataset = self.parse_map_file(dataset,self.map_file)
-      dataset = self.parse_map_file(dataset,self.map_2nd_file)
-
-      # store the full set
-      self.testing_set = dataset
-
-
-def compile_dataset_direct(filtered_reads,dataset_file):
-
-   strand_map = ['-','+']
-
-   SeqInfo = []
-   OriginalEsts = []
-   Qualities = []
-
-   instance_counter = 0
-
-   for line in open(filtered_reads):
-      line  = line.strip()
-      slist = line.split()
-      id    = int(slist[0])
-
-      if not id < 1000000300000:
-         continue
-
-      if instance_counter % 1000 == 0:
-         print 'processed %d examples' % instance_counter
-
-      chr      = int(slist[1])
-      strand   = slist[2]
-      strand   = strand_map[strand == 'D']
-
-      genomicSeq_start = int(slist[10]) - 1000
-      genomicSeq_stop  = int(slist[13] ) + 1000
-
-      original_est = slist[3]
-      original_est = original_est.lower()
-      #print original_est
-
-      prb      = [ord(elem)-50 for elem in slist[6]]
-      cal_prb  = [ord(elem)-64 for elem in slist[7]]
-      chastity = [ord(elem)+10 for elem in slist[8]]
-
-      #pdb.set_trace()
-
-      # add instance to set
-      SeqInfo.append((id,chr,strand,genomicSeq_start,genomicSeq_stop))
-      OriginalEsts.append(original_est)
-      Qualities.append( (prb,cal_prb,chastity) )
-
-      instance_counter += 1
-
-   dataset = [SeqInfo, OriginalEsts, Qualities ]
-
-   # saving new dataset
-   cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)
-
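-# Worked example (illustrative, not part of the original script) of the
-# ord()-based quality decoding used above and in parseLine below:
-#
-#    >>> [ord(c)-50 for c in 'SI']
-#    [33, 23]
-#
-# each quality character is shifted by a fixed offset: prb subtracts 50,
-# cal_prb subtracts 64 and chastity adds 10.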
-
-
-
-def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
-   """
-   Now we want to use interval_query to get the predicted splice scores trained
-   on the TAIR sequence and annotation.
-   """
-
-   size = intervalEnd-intervalBegin
-   assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
-
-   # fetch acceptor scores
-   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
-   acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
-
-   # fetch donor scores
-   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
-   don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
-
-   return acc, don
-
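-# Hypothetical call (for illustration only; the argument conventions follow
-# the rest of this file):
-#
-#    acc, don = getSpliceScores(1,'+',10000,12000)
-#
-# acc and don then hold one score per interval position, with -inf at
-# positions that are not predicted splice sites.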
-
-def process_map(currentRRead,fRead):
-   """
-   For all matches found by Vmatch we calculate the fragment of the DNA we
-   would like to perform an aligment during prediction.
-   """
-
-   fragment_size = 3000
-
-   rId      = currentRRead['id']
-   pos      = currentRRead['pos']
-   chr      = currentRRead['chr']
-   strand   = currentRRead['strand']
-
-   CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
-
-   # vmatch found the correct position
-   if fRead['chr'] == chr and fRead['strand'] == strand and fRead['p_start']-36 <= pos <= fRead['p_stop']+36:
-      genomicSeq_start = fRead['p_start'] - CUT_OFFSET
-      genomicSeq_stop = fRead['p_stop'] + CUT_OFFSET
-   else:
-      genomicSeq_start = pos - (fragment_size/2)
-      genomicSeq_stop = pos + (fragment_size/2)
-
-   return (rId,chr,strand,genomicSeq_start,genomicSeq_stop)
-
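-# Note: if vmatch hit the region of the original read, the fragment is the
-# read's genomic span plus a random offset; otherwise a fragment_size window
-# is centered on the reported position (pos +/- 1500, the same window used
-# in DatasetGenerator.parse_map_file above).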
-
-def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
-   """
-   This function expects an interval, chromosome and strand information and
-   returns then the genomic sequence of this interval and the associated scores.
-   """
-
-   assert genomicSeq_start < genomicSeq_stop
-
-   chrom         = 'chr%d' % chr
-   genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
-   genomicSeq = genomicSeq.lower()
-
-   # check the obtained dna sequence
-   assert genomicSeq != '', 'load_genomic returned empty sequence!'
-   
-   # all entries other than a c g t are set to n
-   no_base = re.compile('[^acgt]')
-   genomicSeq = no_base.sub('n',genomicSeq)
-
-   intervalBegin  = genomicSeq_start-100
-   intervalEnd    = genomicSeq_stop+100
-   seq_pos_offset = genomicSeq_start
-
-   currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
-
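-   # the interval was queried with 100nt of padding on each side; the slices
-   # below trim that padding again, and the extra one-position shifts line the
-   # score arrays up with the 'ag'/'gt' dimer positions checked further down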
-   currentAcc = currentAcc[100:-98]
-   currentAcc = currentAcc[1:]
-   currentDon = currentDon[100:-100]
-
-   length = len(genomicSeq)
-   currentAcc = currentAcc[:length]
-
-   currentDon = currentDon+[-inf]*(length-len(currentDon))
-
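-   # sanity check: every position with a non(-inf) acceptor score must sit on
-   # an 'ag' dimer, every donor score on a 'gt' (or non-canonical 'gc') dimer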
-   ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
-   gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
-
-   assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
-   assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
-   assert len(currentAcc) == len(currentDon)
-
-   return genomicSeq, currentAcc, currentDon
-
-
-def reverse_complement(seq):
-   complement = {'a':'t','c':'g','g':'c','t':'a'}
-
-   new_seq = [complement[elem] for elem in seq]
-   new_seq.reverse()
-   new_seq = "".join(new_seq)
-
-   return new_seq
-
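-# Example (illustrative): reverse_complement('gattaca') returns 'tgtaatc' --
-# every base is complemented first, then the result is reversed.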
-
-def get_seq(begin,end,exon_end):
-   """
-   """
-
-   dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-
-   if exon_end:
-      gene_start = begin
-      gene_stop  = end+2
-   else:
-      gene_start = begin-2
-      gene_stop  = end
-
-   chrom         = 'chr%d' % 1
-   strand        = '+'
-
-   genomicSeq = load_genomic(chrom,strand,gene_start,gene_stop,dna_flat_files,one_based=False)
-   genomicSeq = genomicSeq.lower()
-
-   return genomicSeq
-
-
-def parseLine(line):
-   """
-   We assume that a line has the following entries:
-   
-   read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop
-
-   """
-   #try:
-   id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()
-   #except:
-   #   id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
-   #   true_cut = -1
-
-   splitpos = int(splitpos)
-   read_size = int(read_size)
-
-   seq=seq.lower()
-
-   assert strand in ['D','P']
-
-   if strand == 'D':
-      strand = '+'
-
-   if strand == 'P':
-      strand = '-'
-
-   chr = int(chr)
-
-   prb = [ord(elem)-50 for elem in prb]
-   cal_prb = [ord(elem)-64 for elem in cal_prb]
-   chastity = [ord(elem)+10 for elem in chastity]
-
-   p_start = int(p_start)
-   exon_stop = int(exon_stop)
-   exon_start = int(exon_start)
-   p_stop = int(p_stop)
-   true_cut = int(true_cut)
-
-   line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
-   'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
-   'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
-   'p_stop':p_stop,'true_cut':true_cut}
-
-   return line_d
-
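-# Hypothetical input line (all values made up) in the format parseLine
-# expects -- 15 whitespace-separated fields ending in true_cut:
-#
-#    1000000000001 3 D gattaca 18 36 SSS SSS SSS AT3G01234 1000 1017 1100 1118 18
-#
-# parseLine turns this into a dict with integer coordinates, strand 'D'
-# mapped to '+' and the three decoded quality lists.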
-
-if __name__ == '__main__':
-   usage = 'usage: compile_dataset.py gff_file dna_flat_files solexa_reads remapped_reads dataset_file'
-
-   if len(sys.argv) != 6:
-      print usage
-      sys.exit(1)
-
-   gff_file, dna_flat_files, solexa_reads, remapped_reads, dataset_file = sys.argv[1:]
-
-   # 'compile_d' is not defined anywhere in this file; compile_dataset_direct
-   # is the closest match and only needs the reads file and the output name
-   compile_dataset_direct(solexa_reads,dataset_file)
diff --git a/scripts/createNewDataset.py b/scripts/createNewDataset.py
deleted file mode 100644 (file)
index fa64382..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import qpalma.Configuration as Conf
-from compile_dataset import DatasetGenerator
-
-filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/allReads.full'
-
-map_1_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/spliced.heuristic'
-map_2_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/map_2nd.vm'
-
-dg = DatasetGenerator(filtered_fn,map_1_fn,map_2_fn)
-#dg.compile_training_set()
-dg.compile_testing_set()
-dg.saveAs('dataset_19_05_08')