+ moved tools from training dataset generation to their own folder
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 3 Jul 2008 10:53:48 +0000 (10:53 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 3 Jul 2008 10:53:48 +0000 (10:53 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9854 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/data_tools/filterReads.c
tools/mappingInfo.sh [deleted file]
tools/qpalma_dataset_generation/compile_dataset.py [deleted file]
tools/qpalma_dataset_generation/createNewDataset.py [deleted file]
tools/run_specific_scripts/training_dataset/compile_dataset.py [new file with mode: 0644]
tools/run_specific_scripts/training_dataset/createNewDataset.py [new file with mode: 0644]

diff --git a/tools/data_tools/filterReads.c b/tools/data_tools/filterReads.c
index ae84946..132f3dc 100644 (file)
@@ -91,8 +91,6 @@ static char *info = "Usage is:\n./filterReads gff reads output";
 
 int combined_reads = 0;
 
-char current_strand;
-
 /*
  *
  *
@@ -186,34 +184,28 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
    int read_start, read_stop;
    // start of the parsing loop 
   
-   //static char *all_strands = "+-";
-   //static char *all_strands = "DP";
-   //int strand_idx;
-   //for(strand_idx=0;strand_idx<2;strand_idx++) {
-   //   char current_strand = all_strands[strand_idx];
-
       while(1) {
          if (gene_idx == numGenes || strcmp(current_line,"") == 0)
             break;
 
          gene_id = currentGene->id;
 
-         if (readCtr != 0 && readCtr % 1000000 == 0) {
-            printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
-            printf("%d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
-            printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
-            printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
-            printf("\t%d useless reads\n",uselessReadCtr);
-            printf("\t%d skipped reads\n",skippedReadCtr);
-            printf("\t%d multioccurring\n",multioccurReadCtr);
-            printf("\t%d wrong_strand\n",wrong_strand_ctr);
-            printf("%d within gene\n",read_within_gene_ctr);
-            printf("%d outside gene\n",read_outside_gene_ctr);
-            printf("%d reads were totally exonic\n",exonicReadCtr);
-            printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
-            printf("%d reads where newly combined from two original reads\n",combined_reads);
-            printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
-         }
+         //if (readCtr != 0 && readCtr % 1000000 == 0) {
+         //   printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
+         //   printf("%d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
+         //   printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
+         //   printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
+         //   printf("\t%d useless reads\n",uselessReadCtr);
+         //   printf("\t%d skipped reads\n",skippedReadCtr);
+         //   printf("\t%d multioccurring\n",multioccurReadCtr);
+         //   printf("\t%d wrong_strand\n",wrong_strand_ctr);
+         //   printf("%d within gene\n",read_within_gene_ctr);
+         //   printf("%d outside gene\n",read_outside_gene_ctr);
+         //   printf("%d reads were totally exonic\n",exonicReadCtr);
+         //   printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
+         //   printf("%d reads where newly combined from two original reads\n",combined_reads);
+         //   printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
+         //}
 
          // pos of the reads is one-based
          status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
@@ -232,11 +224,6 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
          read_start = pos;
          read_stop  = pos + read_size-1;
 
-         if( strand != current_strand ) {
-            wrong_strand_ctr++;
-            goto next_read;
-         }
-
          FA(strlen(seq) >= read_size);
 
          FA(currentGene != 0);
@@ -250,7 +237,7 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
                gene_idx++;
                exon_idx = 1;
                currentGene = (*allGenes)[gene_idx];
-               while( (currentGene == 0 || currentGene->strand != current_strand) && gene_idx < numGenes) {
+               while( currentGene == 0 && gene_idx < numGenes) {
                   currentGene = (*allGenes)[gene_idx];
                   gene_idx++;
                }
@@ -615,7 +602,7 @@ void open_file(const char* filename, const char* mode, FILE** fs) {
 
 int main(int argc, char* argv[]) {
 
-   if(argc != 8) {
+   if(argc != 7) {
       printf("%s\n",info);
       exit(EXIT_FAILURE);
    }
@@ -624,9 +611,7 @@ int main(int argc, char* argv[]) {
    int filenameSize = 256;
    char* gff_filename = malloc(sizeof(char)*filenameSize);
 
-   current_strand = argv[1][0];
-   printf("current strand is %c\n",current_strand);
-   strncpy(gff_filename,argv[2],filenameSize);
+   strncpy(gff_filename,argv[1],filenameSize);
 
    FILE *gff_fs;
    FILE *reads_fs;
@@ -634,15 +619,15 @@ int main(int argc, char* argv[]) {
    FILE *unfiltered_out_fs;
 
    // open file streams for all needed input/output files
-   open_file(argv[2],"r",&gff_fs);
-   open_file(argv[3],"r",&reads_fs);
-   open_file(argv[4],"w",&out_fs);
-   open_file(argv[5],"w",&unfiltered_out_fs);
+   open_file(argv[1],"r",&gff_fs);
+   open_file(argv[2],"r",&reads_fs);
+   open_file(argv[3],"w",&out_fs);
+   open_file(argv[4],"w",&unfiltered_out_fs);
 
-   read_nr = strtoul(argv[6],NULL,10);
+   read_nr = strtoul(argv[5],NULL,10);
    read_nr++;
 
-   unfiltered_read_nr = strtoul(argv[7],NULL,10);
+   unfiltered_read_nr = strtoul(argv[6],NULL,10);
    unfiltered_read_nr++;
 
    // allocate and load all genes and then close the gff file stream
@@ -668,6 +653,7 @@ int main(int argc, char* argv[]) {
          printf("Invalid positions for gene %s!\n",currentGene->id);
 
       if (! (old_gene_stop <  currentGene->start ) ) {
+         printf("Gene %s is overlapping\n",currentGene->id);
          old_gene_stop = currentGene->stop;
          allGenes[g_idx] = 0;
          nulled_genes++;
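
With the strand argument gone, filterReads now expects six arguments, and every argv index above shifts down by one. A minimal driver sketch of the new calling convention (the file names are hypothetical; only the argument order is taken from the open_file/strtoul calls in main):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Hypothetical invocation of the re-worked filterReads binary. The file names
# below are placeholders; the argument order mirrors main() after this commit.
import subprocess

subprocess.check_call(['./filterReads',
                       'genes.gff',       # argv[1]: GFF annotation (was argv[2])
                       'reads.txt',       # argv[2]: read file (was argv[3])
                       'filtered.out',    # argv[3]: filtered output (was argv[4])
                       'unfiltered.out',  # argv[4]: unfiltered output (was argv[5])
                       '0',               # argv[5]: initial read_nr (was argv[6])
                       '0'])              # argv[6]: initial unfiltered_read_nr (was argv[7])
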
diff --git a/tools/mappingInfo.sh b/tools/mappingInfo.sh
deleted file mode 100755 (executable)
index f8b34bc..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-
-dir=$1
-
-num=`cut -f1 $dir/main/reads_0.fl | sort -u | wc -l`
-echo "Started 1st vmatch run with $num reads."
-
-num=`cut -f1 $dir/main/map.vm | sort -u | wc -l`
-echo "Found $num reads in 1st vmatch run."
-
-num=`cut -f1 $dir/spliced/reads_0.fl | sort -u | wc -l`
-echo "Started 2nd vmatch run with $num reads."
-
-num=`cut -f1 $dir/spliced/map.vm | sort -u | wc -l`
-echo "Found $num reads in 2nd vmatch run"
diff --git a/tools/qpalma_dataset_generation/compile_dataset.py b/tools/qpalma_dataset_generation/compile_dataset.py
deleted file mode 100644 (file)
index a125664..0000000
+++ /dev/null
@@ -1,435 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import sys
-import random
-import os
-import re
-import pdb
-import cPickle
-
-import numpy
-from numpy.matlib import mat,zeros,ones,inf
-
-import qpalma
-import qpalma.tools
-
-from qpalma.parsers import *
-
-from Genefinding import *
-
-from genome_utils import load_genomic
-
-import qpalma.Configuration as Conf
-
-
-class DatasetGenerator:
-   """
-   Compiles QPalma training and testing sets from filtered reads and vmatch map files.
-   """
-   
-   def __init__(self,filtered_reads,map_file,map_2nd_file):
-      assert os.path.exists(filtered_reads), 'Error: Can not find reads file'
-      self.filtered_reads = filtered_reads
-
-      assert os.path.exists(map_file), 'Error: Can not find map file'
-      assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
-      self.map_file = map_file
-      self.map_2nd_file = map_2nd_file
-
-      self.training_set = []
-      self.testing_set  = []
-
-      print 'parsing filtered reads..'
-      self.all_filtered_reads = parse_filtered_reads(self.filtered_reads)
-      print 'found %d filtered reads' % len(self.all_filtered_reads)
-
-
-   def saveAs(self,dataset_file):
-      assert not os.path.exists(dataset_file), 'The data_file already exists!'
-
-      # saving new datasets
-
-      #all_keys = self.training_set.keys()
-      #random.shuffle(all_keys)
-      #training_keys = all_keys[0:10000]
-      #cPickle.dump(self.training_set,open('%s.train.pickle'%dataset_file,'w+'),protocol=2)
-      #cPickle.dump(training_keys,open('%s.train_keys.pickle'%dataset_file,'w+'),protocol=2)
-
-      cPickle.dump(self.testing_set,open('%s.test.pickle'%dataset_file,'w+'),protocol=2)
-
-      prediction_keys = [0]*len(self.testing_set.keys())
-      for pos,key in enumerate(self.testing_set.keys()):
-         prediction_keys[pos] = key
-
-      cPickle.dump(prediction_keys,open('%s.test_keys.pickle'%dataset_file,'w+'),protocol=2)
-
-
-   def compile_training_set(self):
-      # this stores the new dataset
-      dataset = {}
-
-      # Iterate over all remapped reads in order to generate for each read a
-      # training / prediction example
-      instance_counter = 1
-      skipped_ctr = 0
-
-      for id,filteredRead in self.all_filtered_reads.items():
-         if instance_counter % 1001 == 0:
-            print 'processed %d examples' % instance_counter
-
-         # training set consists only of spliced reads
-         if not id < 1000000300000:
-            continue 
-
-         if filteredRead['strand'] != '+':
-            skipped_ctr += 1
-            continue
-
-         if not filteredRead['chr'] in range(1,6):
-            skipped_ctr += 1
-            continue
-
-         # we cut out the genomic region for training 
-         CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
-         genomicSeq_start  = filteredRead['p_start'] - CUT_OFFSET
-         genomicSeq_stop   = filteredRead['p_stop'] + CUT_OFFSET
-
-         # information of the full read we need for training
-         chromo         = filteredRead['chr']
-         strand         = filteredRead['strand']
-         original_est   = filteredRead['seq']
-         quality        = filteredRead['prb']
-         cal_prb        = filteredRead['cal_prb']
-         chastity       = filteredRead['chastity']
-
-         currentExons = zeros((2,2),dtype=numpy.int)
-         currentExons[0,0] = filteredRead['p_start']
-         currentExons[0,1] = filteredRead['exon_stop']
-         currentExons[1,0] = filteredRead['exon_start']
-         currentExons[1,1] = filteredRead['p_stop']
-
-         # add instance to set
-         currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
-         currentQualities = (quality,cal_prb,chastity)
-
-         dataset[id] = (currentSeqInfo,currentExons,original_est,currentQualities)
-
-         instance_counter += 1
-
-      print 'Full dataset has size %d' % len(dataset)
-      print 'Skipped %d reads' % skipped_ctr
-
-      self.training_set = dataset
-      
-
-   def parse_map_file(self,dataset,map_file):
-      strand_map = ['-','+']
-      instance_counter = 1
-
-      for line in open(map_file):
-         if instance_counter % 1001 == 0:
-            print 'processed %d examples' % instance_counter
-
-         line  = line.strip()
-         slist = line.split()
-         id    = int(slist[0])
-         chromo   = int(slist[1])
-         pos      = int(slist[2])
-         strand   = slist[3]
-         strand   = strand_map[strand == 'D']
-
-         if strand != '+':
-            continue
-
-         if not chromo in range(1,6):
-            continue
-
-         genomicSeq_start = pos - 1500
-         genomicSeq_stop  = pos + 1500
-
-         # fetch missing information from original reads
-         filteredRead = self.all_filtered_reads[id]
-         original_est = filteredRead['seq']
-         original_est = original_est.lower()
-
-         # keep the lower-cased version of the read sequence
-         prb     = filteredRead['prb']
-         cal_prb = filteredRead['cal_prb']
-         chastity = filteredRead['chastity']
-
-         # add instance to set
-         currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
-         currentQualities = (prb,cal_prb,chastity)
-
-         # as one single read can have several vmatch matches we store all
-         # these matches under the unique id of the read
-         if dataset.has_key(id):
-            old_entry = dataset[id]
-            old_entry.append((currentSeqInfo,original_est,currentQualities))
-            dataset[id] = old_entry
-         else:
-            dataset[id] = [(currentSeqInfo,original_est,currentQualities)]
-
-         instance_counter += 1
-
-      print 'Total examples processed: %d' % instance_counter
-
-      return dataset
-
-
-   def compile_testing_set(self):
-
-      dataset = {}
-
-      # usually we have two files to parse:
-      # the map file from the second run and a subset of the map file from the
-      # first run
-      dataset = self.parse_map_file(dataset,self.map_file)
-      dataset = self.parse_map_file(dataset,self.map_2nd_file)
-
-      # store the full set
-      self.testing_set = dataset
-
-
-def compile_dataset_direct(filtered_reads,dataset_file):
-
-   strand_map = ['-','+']
-
-   SeqInfo = []
-   OriginalEsts = []
-   Qualities = []
-
-   instance_counter = 0
-
-   for line in open(filtered_reads):
-      line  = line.strip()
-      slist = line.split()
-      id    = int(slist[0])
-
-      if not id < 1000000300000:
-         continue
-
-      if instance_counter % 1000 == 0:
-         print 'processed %d examples' % instance_counter
-
-      chr      = int(slist[1])
-      strand   = slist[2]
-      strand   = strand_map[strand == 'D']
-
-      genomicSeq_start = int(slist[10]) - 1000
-      genomicSeq_stop  = int(slist[13] ) + 1000
-
-      original_est = slist[3]
-      original_est = original_est.lower()
-      #print original_est
-
-      prb      = [ord(elem)-50 for elem in slist[6]]
-      cal_prb  = [ord(elem)-64 for elem in slist[7]]
-      chastity = [ord(elem)+10 for elem in slist[8]]
-
-      #pdb.set_trace()
-
-      # add instance to set
-      SeqInfo.append((id,chr,strand,genomicSeq_start,genomicSeq_stop))
-      OriginalEsts.append(original_est)
-      Qualities.append( (prb,cal_prb,chastity) )
-
-      instance_counter += 1
-
-   dataset = [SeqInfo, OriginalEsts, Qualities ]
-
-   # saving new dataset
-   cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)
-
-
-
-
-def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
-   """
-   Now we want to use interval_query to get the predicted splice scores trained
-   on the TAIR sequence and annotation.
-   """
-
-   size = intervalEnd-intervalBegin
-   assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
-
-   acc = size*[0.0]
-   don = size*[0.0]
-
-   interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
-   pos_size = new_intp()
-   intp_assign(pos_size,1)
-
-   # fetch acceptor scores
-   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
-   acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
-
-   # fetch donor scores
-   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
-   don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
-
-   return acc, don
-
-
-def process_map(currentRRead,fRead):
-   """
-   For all matches found by Vmatch we calculate the DNA fragment on which we
-   would like to perform an alignment during prediction.
-   """
-
-   fragment_size = 3000
-
-   alternativeSeq = []
-
-   onePositiveLabel = False
-
-   rId      = currentRRead['id']
-   pos      = currentRRead['pos']
-   chr      = currentRRead['chr']
-   strand   = currentRRead['strand']
-
-   length   = currentRRead['length']
-   offset   = currentRRead['offset']
-
-   CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
-
-   # vmatch found the correct position
-   if fRead['chr'] == chr and fRead['strand'] == strand and fRead['p_start']-36 <= pos <= fRead['p_stop']+36:
-      genomicSeq_start = fRead['p_start'] - CUT_OFFSET
-      genomicSeq_stop = fRead['p_stop'] + CUT_OFFSET
-   else:
-      genomicSeq_start = pos - (fragment_size/2)
-      genomicSeq_stop = pos + (fragment_size/2)
-
-   return (rId,chr,strand,genomicSeq_start,genomicSeq_stop)
-
-
-def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
-   """
-   This function expects an interval, chromosome and strand information and
-   then returns the genomic sequence of this interval and the associated scores.
-   """
-
-   assert genomicSeq_start < genomicSeq_stop
-
-   chrom         = 'chr%d' % chr
-   genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
-   genomicSeq = genomicSeq.lower()
-
-   # check the obtained dna sequence
-   assert genomicSeq != '', 'load_genomic returned empty sequence!'
-   
-   # all entries other than a c g t are set to n
-   no_base = re.compile('[^acgt]')
-   genomicSeq = no_base.sub('n',genomicSeq)
-
-   intervalBegin  = genomicSeq_start-100
-   intervalEnd    = genomicSeq_stop+100
-   seq_pos_offset = genomicSeq_start
-
-   currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
-
-   currentAcc = currentAcc[100:-98]
-   currentAcc = currentAcc[1:]
-   currentDon = currentDon[100:-100]
-
-   length = len(genomicSeq)
-   currentAcc = currentAcc[:length]
-
-   currentDon = currentDon+[-inf]*(length-len(currentDon))
-
-   ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
-   gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
-
-   assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
-   assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
-   assert len(currentAcc) == len(currentDon)
-
-   return genomicSeq, currentAcc, currentDon
-
-
-def reverse_complement(seq):
-   map = {'a':'t','c':'g','g':'c','t':'a'}
-
-   new_seq = [map[elem] for elem in seq]
-   new_seq.reverse()
-   new_seq = "".join(new_seq)
-
-   return new_seq
-
-
-def get_seq(begin,end,exon_end):
-   """
-   """
-
-   dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-
-   if exon_end:
-      gene_start = begin
-      gene_stop  = end+2
-   else:
-      gene_start = begin-2
-      gene_stop  = end
-
-   chrom         = 'chr%d' % 1
-   strand        = '+'
-
-   genomicSeq = load_genomic(chrom,strand,gene_start,gene_stop,dna_flat_files,one_based=False)
-   genomicSeq = genomicSeq.lower()
-
-   return genomicSeq
-
-
-def parseLine(line):
-   """
-   We assume that a line has the following entries:
-   
-   read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop
-
-   """
-   #try:
-   id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()
-   #except:
-   #   id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
-   #   true_cut = -1
-
-   splitpos = int(splitpos)
-   read_size = int(read_size)
-
-   seq=seq.lower()
-
-   assert strand in ['D','P']
-
-   if strand == 'D':
-      strand = '+'
-
-   if strand == 'P':
-      strand = '-'
-
-   chr = int(chr)
-
-   prb = [ord(elem)-50 for elem in prb]
-   cal_prb = [ord(elem)-64 for elem in cal_prb]
-   chastity = [ord(elem)+10 for elem in chastity]
-
-   p_start = int(p_start)
-   exon_stop = int(exon_stop)
-   exon_start = int(exon_start)
-   p_stop = int(p_stop)
-   true_cut = int(true_cut)
-
-   line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
-   'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
-   'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
-   'p_stop':p_stop,'true_cut':true_cut}
-
-   return line_d
-
-
-if __name__ == '__main__':
-   if len(sys.argv) == 1:
-      print info
-
-   assert len(sys.argv) == 6, help
-   compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file)
diff --git a/tools/qpalma_dataset_generation/createNewDataset.py b/tools/qpalma_dataset_generation/createNewDataset.py
deleted file mode 100644 (file)
index fa64382..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import qpalma.Configuration as Conf
-from compile_dataset import DatasetGenerator
-
-filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/allReads.full'
-
-map_1_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/spliced.heuristic'
-map_2_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/map_2nd.vm'
-
-dg = DatasetGenerator(filtered_fn,map_1_fn,map_2_fn)
-#dg.compile_training_set()
-dg.compile_testing_set()
-dg.saveAs('dataset_19_05_08')
diff --git a/tools/run_specific_scripts/training_dataset/compile_dataset.py b/tools/run_specific_scripts/training_dataset/compile_dataset.py
new file mode 100644 (file)
index 0000000..a125664
--- /dev/null
@@ -0,0 +1,435 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import random
+import os
+import re
+import pdb
+import cPickle
+
+import numpy
+from numpy.matlib import mat,zeros,ones,inf
+
+import qpalma
+import qpalma.tools
+
+from qpalma.parsers import *
+
+from Genefinding import *
+
+from genome_utils import load_genomic
+
+import qpalma.Configuration as Conf
+
+
+class DatasetGenerator:
+   """
+   Compiles QPalma training and testing sets from filtered reads and vmatch map files.
+   """
+   
+   def __init__(self,filtered_reads,map_file,map_2nd_file):
+      assert os.path.exists(filtered_reads), 'Error: Can not find reads file'
+      self.filtered_reads = filtered_reads
+
+      assert os.path.exists(map_file), 'Error: Can not find map file'
+      assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
+      self.map_file = map_file
+      self.map_2nd_file = map_2nd_file
+
+      self.training_set = []
+      self.testing_set  = []
+
+      print 'parsing filtered reads..'
+      self.all_filtered_reads = parse_filtered_reads(self.filtered_reads)
+      print 'found %d filtered reads' % len(self.all_filtered_reads)
+
+
+   def saveAs(self,dataset_file):
+      assert not os.path.exists(dataset_file), 'The data_file already exists!'
+
+      # saving new datasets
+
+      #all_keys = self.training_set.keys()
+      #random.shuffle(all_keys)
+      #training_keys = all_keys[0:10000]
+      #cPickle.dump(self.training_set,open('%s.train.pickle'%dataset_file,'w+'),protocol=2)
+      #cPickle.dump(training_keys,open('%s.train_keys.pickle'%dataset_file,'w+'),protocol=2)
+
+      cPickle.dump(self.testing_set,open('%s.test.pickle'%dataset_file,'w+'),protocol=2)
+
+      prediction_keys = [0]*len(self.testing_set.keys())
+      for pos,key in enumerate(self.testing_set.keys()):
+         prediction_keys[pos] = key
+
+      cPickle.dump(prediction_keys,open('%s.test_keys.pickle'%dataset_file,'w+'),protocol=2)
+
+
+   def compile_training_set(self):
+      # this stores the new dataset
+      dataset = {}
+
+      # Iterate over all remapped reads in order to generate for each read a
+      # training / prediction example
+      instance_counter = 1
+      skipped_ctr = 0
+
+      for id,filteredRead in self.all_filtered_reads.items():
+         if instance_counter % 1001 == 0:
+            print 'processed %d examples' % instance_counter
+
+         # training set consists only of spliced reads
+         if not id < 1000000300000:
+            continue 
+
+         if filteredRead['strand'] != '+':
+            skipped_ctr += 1
+            continue
+
+         if not filteredRead['chr'] in range(1,6):
+            skipped_ctr += 1
+            continue
+
+         # we cut out the genomic region for training 
+         CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
+         genomicSeq_start  = filteredRead['p_start'] - CUT_OFFSET
+         genomicSeq_stop   = filteredRead['p_stop'] + CUT_OFFSET
+
+         # information of the full read we need for training
+         chromo         = filteredRead['chr']
+         strand         = filteredRead['strand']
+         original_est   = filteredRead['seq']
+         quality        = filteredRead['prb']
+         cal_prb        = filteredRead['cal_prb']
+         chastity       = filteredRead['chastity']
+
+         currentExons = zeros((2,2),dtype=numpy.int)
+         currentExons[0,0] = filteredRead['p_start']
+         currentExons[0,1] = filteredRead['exon_stop']
+         currentExons[1,0] = filteredRead['exon_start']
+         currentExons[1,1] = filteredRead['p_stop']
+
+         # add instance to set
+         currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
+         currentQualities = (quality,cal_prb,chastity)
+
+         dataset[id] = (currentSeqInfo,currentExons,original_est,currentQualities)
+
+         instance_counter += 1
+
+      print 'Full dataset has size %d' % len(dataset)
+      print 'Skipped %d reads' % skipped_ctr
+
+      self.training_set = dataset
+      
+
+   def parse_map_file(self,dataset,map_file):
+      strand_map = ['-','+']
+      instance_counter = 1
+
+      for line in open(map_file):
+         if instance_counter % 1001 == 0:
+            print 'processed %d examples' % instance_counter
+
+         line  = line.strip()
+         slist = line.split()
+         id    = int(slist[0])
+         chromo   = int(slist[1])
+         pos      = int(slist[2])
+         strand   = slist[3]
+         strand   = strand_map[strand == 'D']
+
+         if strand != '+':
+            continue
+
+         if not chromo in range(1,6):
+            continue
+
+         genomicSeq_start = pos - 1500
+         genomicSeq_stop  = pos + 1500
+
+         # fetch missing information from original reads
+         filteredRead = self.all_filtered_reads[id]
+         original_est = filteredRead['seq']
+         original_est = original_est.lower()
+
+         # keep the lower-cased version of the read sequence
+         prb     = filteredRead['prb']
+         cal_prb = filteredRead['cal_prb']
+         chastity = filteredRead['chastity']
+
+         # add instance to set
+         currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
+         currentQualities = (prb,cal_prb,chastity)
+
+         # as one single read can have several vmatch matches we store all
+         # these matches under the unique id of the read
+         if dataset.has_key(id):
+            old_entry = dataset[id]
+            old_entry.append((currentSeqInfo,original_est,currentQualities))
+            dataset[id] = old_entry
+         else:
+            dataset[id] = [(currentSeqInfo,original_est,currentQualities)]
+
+         instance_counter += 1
+
+      print 'Total examples processed: %d' % instance_counter
+
+      return dataset
+
+
+   def compile_testing_set(self):
+
+      dataset = {}
+
+      # usually we have two files to parse:
+      # the map file from the second run and a subset of the map file from the
+      # first run
+      dataset = self.parse_map_file(dataset,self.map_file)
+      dataset = self.parse_map_file(dataset,self.map_2nd_file)
+
+      # store the full set
+      self.testing_set = dataset
+
+
+def compile_dataset_direct(filtered_reads,dataset_file):
+
+   strand_map = ['-','+']
+
+   SeqInfo = []
+   OriginalEsts = []
+   Qualities = []
+
+   instance_counter = 0
+
+   for line in open(filtered_reads):
+      line  = line.strip()
+      slist = line.split()
+      id    = int(slist[0])
+
+      if not id < 1000000300000:
+         continue
+
+      if instance_counter % 1000 == 0:
+         print 'processed %d examples' % instance_counter
+
+      chr      = int(slist[1])
+      strand   = slist[2]
+      strand   = strand_map[strand == 'D']
+
+      genomicSeq_start = int(slist[10]) - 1000
+      genomicSeq_stop  = int(slist[13] ) + 1000
+
+      original_est = slist[3]
+      original_est = original_est.lower()
+      #print original_est
+
+      prb      = [ord(elem)-50 for elem in slist[6]]
+      cal_prb  = [ord(elem)-64 for elem in slist[7]]
+      chastity = [ord(elem)+10 for elem in slist[8]]
+
+      #pdb.set_trace()
+
+      # add instance to set
+      SeqInfo.append((id,chr,strand,genomicSeq_start,genomicSeq_stop))
+      OriginalEsts.append(original_est)
+      Qualities.append( (prb,cal_prb,chastity) )
+
+      instance_counter += 1
+
+   dataset = [SeqInfo, OriginalEsts, Qualities ]
+
+   # saving new dataset
+   cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)
+
+
+
+
+def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
+   """
+   Now we want to use interval_query to get the predicted splice scores trained
+   on the TAIR sequence and annotation.
+   """
+
+   size = intervalEnd-intervalBegin
+   assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
+
+   acc = size*[0.0]
+   don = size*[0.0]
+
+   interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
+   pos_size = new_intp()
+   intp_assign(pos_size,1)
+
+   # fetch acceptor scores
+   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
+   acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
+
+   # fetch donor scores
+   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
+   don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
+
+   return acc, don
+
+
+def process_map(currentRRead,fRead):
+   """
+   For all matches found by Vmatch we calculate the DNA fragment on which we
+   would like to perform an alignment during prediction.
+   """
+
+   fragment_size = 3000
+
+   alternativeSeq = []
+
+   onePositiveLabel = False
+
+   rId      = currentRRead['id']
+   pos      = currentRRead['pos']
+   chr      = currentRRead['chr']
+   strand   = currentRRead['strand']
+
+   length   = currentRRead['length']
+   offset   = currentRRead['offset']
+
+   CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
+
+   # vmatch found the correct position
+   if fRead['chr'] == chr and fRead['strand'] == strand and fRead['p_start']-36 <= pos <= fRead['p_stop']+36:
+      genomicSeq_start = fRead['p_start'] - CUT_OFFSET
+      genomicSeq_stop = fRead['p_stop'] + CUT_OFFSET
+   else:
+      genomicSeq_start = pos - (fragment_size/2)
+      genomicSeq_stop = pos + (fragment_size/2)
+
+   return (rId,chr,strand,genomicSeq_start,genomicSeq_stop)
+
+
+def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
+   """
+   This function expects an interval, chromosome and strand information and
+   then returns the genomic sequence of this interval and the associated scores.
+   """
+
+   assert genomicSeq_start < genomicSeq_stop
+
+   chrom         = 'chr%d' % chr
+   genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
+   genomicSeq = genomicSeq.lower()
+
+   # check the obtained dna sequence
+   assert genomicSeq != '', 'load_genomic returned empty sequence!'
+   
+   # all entries other than a c g t are set to n
+   no_base = re.compile('[^acgt]')
+   genomicSeq = no_base.sub('n',genomicSeq)
+
+   intervalBegin  = genomicSeq_start-100
+   intervalEnd    = genomicSeq_stop+100
+   seq_pos_offset = genomicSeq_start
+
+   currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
+
+   currentAcc = currentAcc[100:-98]
+   currentAcc = currentAcc[1:]
+   currentDon = currentDon[100:-100]
+
+   length = len(genomicSeq)
+   currentAcc = currentAcc[:length]
+
+   currentDon = currentDon+[-inf]*(length-len(currentDon))
+
+   ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
+   gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+
+   assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
+   assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
+   assert len(currentAcc) == len(currentDon)
+
+   return genomicSeq, currentAcc, currentDon
+
+
+def reverse_complement(seq):
+   map = {'a':'t','c':'g','g':'c','t':'a'}
+
+   new_seq = [map[elem] for elem in seq]
+   new_seq.reverse()
+   new_seq = "".join(new_seq)
+
+   return new_seq
+
+
+def get_seq(begin,end,exon_end):
+   """
+   """
+
+   dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+
+   if exon_end:
+      gene_start = begin
+      gene_stop  = end+2
+   else:
+      gene_start = begin-2
+      gene_stop  = end
+
+   chrom         = 'chr%d' % 1
+   strand        = '+'
+
+   genomicSeq = load_genomic(chrom,strand,gene_start,gene_stop,dna_flat_files,one_based=False)
+   genomicSeq = genomicSeq.lower()
+
+   return genomicSeq
+
+
+def parseLine(line):
+   """
+   We assume that a line has the following entries:
+   
+   read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop
+
+   """
+   #try:
+   id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()
+   #except:
+   #   id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
+   #   true_cut = -1
+
+   splitpos = int(splitpos)
+   read_size = int(read_size)
+
+   seq=seq.lower()
+
+   assert strand in ['D','P']
+
+   if strand == 'D':
+      strand = '+'
+
+   if strand == 'P':
+      strand = '-'
+
+   chr = int(chr)
+
+   prb = [ord(elem)-50 for elem in prb]
+   cal_prb = [ord(elem)-64 for elem in cal_prb]
+   chastity = [ord(elem)+10 for elem in chastity]
+
+   p_start = int(p_start)
+   exon_stop = int(exon_stop)
+   exon_start = int(exon_start)
+   p_stop = int(p_stop)
+   true_cut = int(true_cut)
+
+   line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
+   'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
+   'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
+   'p_stop':p_stop,'true_cut':true_cut}
+
+   return line_d
+
+
+if __name__ == '__main__':
+   if len(sys.argv) == 1:
+      print info
+
+   assert len(sys.argv) == 6, help
+   compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file)
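
The per-base quality strings in this file are decoded with fixed ASCII offsets (see compile_dataset_direct and parseLine above). A self-contained sketch of just that step, with the offsets copied from the code:

# Decode the three Solexa quality strings the same way as
# compile_dataset_direct()/parseLine() above; each character encodes one
# per-base value with a fixed ASCII offset.
def decode_qualities(prb_str, cal_prb_str, chastity_str):
    prb      = [ord(c) - 50 for c in prb_str]       # quality values
    cal_prb  = [ord(c) - 64 for c in cal_prb_str]   # calibrated quality values
    chastity = [ord(c) + 10 for c in chastity_str]  # chastity values
    return prb, cal_prb, chastity
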
diff --git a/tools/run_specific_scripts/training_dataset/createNewDataset.py b/tools/run_specific_scripts/training_dataset/createNewDataset.py
new file mode 100644 (file)
index 0000000..fa64382
--- /dev/null
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import qpalma.Configuration as Conf
+from compile_dataset import DatasetGenerator
+
+filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/allReads.full'
+
+map_1_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/spliced.heuristic'
+map_2_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/map_2nd.vm'
+
+dg = DatasetGenerator(filtered_fn,map_1_fn,map_2_fn)
+#dg.compile_training_set()
+dg.compile_testing_set()
+dg.saveAs('dataset_19_05_08')
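
DatasetGenerator.saveAs() writes the testing set and its key list as two pickle files derived from the given basename. A minimal read-back sketch, assuming the 'dataset_19_05_08' basename used above:

# Read back the files written by DatasetGenerator.saveAs(); the basename
# matches the createNewDataset.py call above.
import cPickle

testing_set     = cPickle.load(open('dataset_19_05_08.test.pickle'))
prediction_keys = cPickle.load(open('dataset_19_05_08.test_keys.pickle'))

# each entry is a list with one (seq_info, original_est, qualities) tuple
# per vmatch match of that read
for key in prediction_keys[:3]:
    print key, len(testing_set[key])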