int combined_reads = 0;
-char current_strand;
-
int read_start, read_stop;
// start of the parsing loop
- //static char *all_strands = "+-";
- //static char *all_strands = "DP";
- //int strand_idx;
- //for(strand_idx=0;strand_idx<2;strand_idx++) {
- // char current_strand = all_strands[strand_idx];
-
while(1) {
if (gene_idx == numGenes || strcmp(current_line,"") == 0)
break;
gene_id = currentGene->id;
- if (readCtr != 0 && readCtr % 1000000 == 0) {
- printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
- printf("%d reads in total while %d reads where skipped.\n",readCtr,skippedLinesCounter);
- printf("%d were totally intronic. %d were outside of genes and %d were occurring on several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
- printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
- printf("\t%d useless reads\n",uselessReadCtr);
- printf("\t%d skipped reads\n",skippedReadCtr);
- printf("\t%d multioccurring\n",multioccurReadCtr);
- printf("\t%d wrong_strand\n",wrong_strand_ctr);
- printf("%d within gene\n",read_within_gene_ctr);
- printf("%d outside gene\n",read_outside_gene_ctr);
- printf("%d reads were totally exonic\n",exonicReadCtr);
- printf("%d reads overlap with prev. exon. %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
- printf("%d reads where newly combined from two original reads\n",combined_reads);
- printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
- }
+  //if (readCtr != 0 && readCtr % 1000000 == 0) {
+  //   printf("Processed %d/%d genes and %d/%d reads.\n",gene_idx,numGenes,readCtr,numReads);
+  //   printf("%d reads in total while %d lines were skipped.\n",readCtr,skippedLinesCounter);
+  //   printf("%d were totally intronic, %d were outside of genes and %d occurred at several positions.\n",uselessReadCtr,skippedReadCtr,multioccurReadCtr);
+  //   printf("%d reads were useless in total\n",uselessReadCtr+skippedReadCtr+multioccurReadCtr);
+  //   printf("\t%d useless reads\n",uselessReadCtr);
+  //   printf("\t%d skipped reads\n",skippedReadCtr);
+  //   printf("\t%d multioccurring\n",multioccurReadCtr);
+  //   printf("\t%d wrong_strand\n",wrong_strand_ctr);
+  //   printf("%d within gene\n",read_within_gene_ctr);
+  //   printf("%d outside gene\n",read_outside_gene_ctr);
+  //   printf("%d reads were totally exonic\n",exonicReadCtr);
+  //   printf("%d reads overlap with prev. exon, %d reads overlap with current exon\n",previousOverlapCtr,currentOverlapCtr);
+  //   printf("%d reads were newly combined from two original reads\n",combined_reads);
+  //   printf("Total used reads: %d (some may not be combined)\n",exonicReadCtr+combined_reads);
+  //}
// pos of the reads is one-based
status = sscanf(current_line,line_format,&chr,&pos,seq,&id,&strand,&mismatch,&occurrence,&size,&cut,prb,cal_prb,chastity);
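// with one-based positions, a read of length read_size covers the
// inclusive interval [pos, pos + read_size - 1]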
read_start = pos;
read_stop = pos + read_size-1;
- if( strand != current_strand ) {
- wrong_strand_ctr++;
- goto next_read;
- }
-
FA(strlen(seq) >= read_size);
FA(currentGene != 0);
gene_idx++;
exon_idx = 1;
currentGene = (*allGenes)[gene_idx];
- while( (currentGene == 0 || currentGene->strand != current_strand) && gene_idx < numGenes) {
+ while( currentGene == 0 && gene_idx < numGenes) {
currentGene = (*allGenes)[gene_idx];
gene_idx++;
}
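// entries of allGenes may be 0: genes found to overlap their neighbors are
// nulled out during loading (see nulled_genes below), so scan ahead to the
// next valid gene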
int main(int argc, char* argv[]) {
- if(argc != 8) {
+ if(argc != 7) {
printf("%s\n",info);
exit(EXIT_FAILURE);
}
int filenameSize = 256;
char* gff_filename = malloc(sizeof(char)*filenameSize);
- current_strand = argv[1][0];
- printf("current strand is %c\n",current_strand);
- strncpy(gff_filename,argv[2],filenameSize);
+ strncpy(gff_filename,argv[1],filenameSize);
FILE *gff_fs;
FILE *reads_fs;
FILE *unfiltered_out_fs;
// open file streams for all needed input/output files
- open_file(argv[2],"r",&gff_fs);
- open_file(argv[3],"r",&reads_fs);
- open_file(argv[4],"w",&out_fs);
- open_file(argv[5],"w",&unfiltered_out_fs);
+ open_file(argv[1],"r",&gff_fs);
+ open_file(argv[2],"r",&reads_fs);
+ open_file(argv[3],"w",&out_fs);
+ open_file(argv[4],"w",&unfiltered_out_fs);
- read_nr = strtoul(argv[6],NULL,10);
+ read_nr = strtoul(argv[5],NULL,10);
read_nr++;
- unfiltered_read_nr = strtoul(argv[7],NULL,10);
+ unfiltered_read_nr = strtoul(argv[6],NULL,10);
unfiltered_read_nr++;
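// Sketch of the new six-argument invocation; the program and file names here
// are placeholders, not taken from this codebase:
//
//   filterReads genes.gff reads.fl filtered.out unfiltered.out 0 0
//
// argv[1] = gff file, argv[2] = reads file, argv[3] = filtered output,
// argv[4] = unfiltered output, argv[5] = read_nr, argv[6] = unfiltered_read_nr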
// allocate and load all genes and then close the gff file stream
printf("Invalid positions for gene %s!\n",currentGene->id);
if (! (old_gene_stop < currentGene->start ) ) {
+      printf("Gene %s overlaps the previous gene\n",currentGene->id);
old_gene_stop = currentGene->stop;
allGenes[g_idx] = 0;
nulled_genes++;
+++ /dev/null
-#!/bin/bash
-
-dir=$1
-
-num=`cut -f1 $dir/main/reads_0.fl | sort -u | wc -l`
-echo "Started 1st vmatch run with $num reads."
-
-num=`cut -f1 $dir/main/map.vm | sort -u | wc -l`
-echo "Found $num reads in 1st vmatch run."
-
-num=`cut -f1 $dir/spliced/reads_0.fl | sort -u | wc -l`
-echo "Started 2nd vmatch run with $num reads."
-
-num=`cut -f1 $dir/spliced/map.vm | sort -u | wc -l`
-echo "Found $num reads in 2nd vmatch run"
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import sys
-import random
-import os
-import re
-import pdb
-import cPickle
-
-import numpy
-from numpy.matlib import mat,zeros,ones,inf
-
-import qpalma
-import qpalma.tools
-
-from qpalma.parsers import *
-
-from Genefinding import *
-
-from genome_utils import load_genomic
-
-import qpalma.Configuration as Conf
-
-
-class DatasetGenerator:
- """
-
- """
-
- def __init__(self,filtered_reads,map_file,map_2nd_file):
- assert os.path.exists(filtered_reads), 'Error: Can not find reads file'
- self.filtered_reads = filtered_reads
-
- assert os.path.exists(map_file), 'Error: Can not find map file'
- assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
- self.map_file = map_file
- self.map_2nd_file = map_2nd_file
-
- self.training_set = []
- self.testing_set = []
-
- print 'parsing filtered reads..'
- self.all_filtered_reads = parse_filtered_reads(self.filtered_reads)
- print 'found %d filtered reads' % len(self.all_filtered_reads)
-
-
- def saveAs(self,dataset_file):
- assert not os.path.exists(dataset_file), 'The data_file already exists!'
-
- # saving new datasets
-
- #all_keys = self.training_set.keys()
- #random.shuffle(all_keys)
- #training_keys = all_keys[0:10000]
- #cPickle.dump(self.training_set,open('%s.train.pickle'%dataset_file,'w+'),protocol=2)
- #cPickle.dump(training_keys,open('%s.train_keys.pickle'%dataset_file,'w+'),protocol=2)
-
- cPickle.dump(self.testing_set,open('%s.test.pickle'%dataset_file,'w+'),protocol=2)
-
- prediction_keys = [0]*len(self.testing_set.keys())
- for pos,key in enumerate(self.testing_set.keys()):
- prediction_keys[pos] = key
-
- cPickle.dump(prediction_keys,open('%s.test_keys.pickle'%dataset_file,'w+'),protocol=2)
-
-
- def compile_training_set(self):
- # this stores the new dataset
- dataset = {}
-
- # Iterate over all remapped reads in order to generate for each read a
- # training / prediction example
- instance_counter = 1
- skipped_ctr = 0
-
- for id,filteredRead in self.all_filtered_reads.items():
- if instance_counter % 1001 == 0:
- print 'processed %d examples' % instance_counter
-
- # training set consists only of spliced reads
- if not id < 1000000300000:
- continue
-
- if filteredRead['strand'] != '+':
- skipped_ctr += 1
- continue
-
- if not filteredRead['chr'] in range(1,6):
- skipped_ctr += 1
- continue
-
- # we cut out the genomic region for training
- CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
- genomicSeq_start = filteredRead['p_start'] - CUT_OFFSET
- genomicSeq_stop = filteredRead['p_stop'] + CUT_OFFSET
-
- # information of the full read we need for training
- chromo = filteredRead['chr']
- strand = filteredRead['strand']
- original_est = filteredRead['seq']
- quality = filteredRead['prb']
- cal_prb = filteredRead['cal_prb']
- chastity = filteredRead['chastity']
-
- currentExons = zeros((2,2),dtype=numpy.int)
- currentExons[0,0] = filteredRead['p_start']
- currentExons[0,1] = filteredRead['exon_stop']
- currentExons[1,0] = filteredRead['exon_start']
- currentExons[1,1] = filteredRead['p_stop']
-
- # add instance to set
- currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
- currentQualities = (quality,cal_prb,chastity)
-
- dataset[id] = (currentSeqInfo,currentExons,original_est,currentQualities)
-
- instance_counter += 1
-
- print 'Full dataset has size %d' % len(dataset)
- print 'Skipped %d reads' % skipped_ctr
-
- self.training_set = dataset
-
-
- def parse_map_file(self,dataset,map_file):
- strand_map = ['-','+']
- instance_counter = 1
-
- for line in open(map_file):
- if instance_counter % 1001 == 0:
- print 'processed %d examples' % instance_counter
-
- line = line.strip()
- slist = line.split()
- id = int(slist[0])
- chromo = int(slist[1])
- pos = int(slist[2])
- strand = slist[3]
- strand = strand_map[strand == 'D']
-
- if strand != '+':
- continue
-
- if not chromo in range(1,6):
- continue
-
- genomicSeq_start = pos - 1500
- genomicSeq_stop = pos + 1500
-
- # fetch missing information from original reads
- filteredRead = self.all_filtered_reads[id]
- original_est = filteredRead['seq']
- original_est = original_est.lower()
-
- original_est = filteredRead['seq']
- prb = filteredRead['prb']
- cal_prb = filteredRead['cal_prb']
- chastity = filteredRead['chastity']
-
- # add instance to set
- currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
- currentQualities = (prb,cal_prb,chastity)
-
- # as one single read can have several vmatch matches we store all
- # these matches under the unique id of the read
- if dataset.has_key(id):
- old_entry = dataset[id]
- old_entry.append((currentSeqInfo,original_est,currentQualities))
- dataset[id] = old_entry
- else:
- dataset[id] = [(currentSeqInfo,original_est,currentQualities)]
-
- instance_counter += 1
-
- print 'Total examples processed: %d' % instance_counter
-
- return dataset
-
-
- def compile_testing_set(self):
-
- dataset = {}
-
- # usually we have two files to parse:
- # the map file from the second run and a subset of the map file from the
- # first run
- dataset = self.parse_map_file(dataset,self.map_file)
- dataset = self.parse_map_file(dataset,self.map_2nd_file)
-
- # store the full set
- self.testing_set = dataset
-
-
-def compile_dataset_direct(filtered_reads,dataset_file):
-
- strand_map = ['-','+']
-
- SeqInfo = []
- OriginalEsts = []
- Qualities = []
-
- instance_counter = 0
-
- for line in open(filtered_reads):
- line = line.strip()
- slist = line.split()
- id = int(slist[0])
-
- if not id < 1000000300000:
- continue
-
- if instance_counter % 1000 == 0:
- print 'processed %d examples' % instance_counter
-
- chr = int(slist[1])
- strand = slist[2]
- strand = strand_map[strand == 'D']
-
- genomicSeq_start = int(slist[10]) - 1000
- genomicSeq_stop = int(slist[13] ) + 1000
-
- original_est = slist[3]
- original_est = original_est.lower()
- #print original_est
-
- prb = [ord(elem)-50 for elem in slist[6]]
- cal_prb = [ord(elem)-64 for elem in slist[7]]
- chastity = [ord(elem)+10 for elem in slist[8]]
-
- #pdb.set_trace()
-
- # add instance to set
- SeqInfo.append((id,chr,strand,genomicSeq_start,genomicSeq_stop))
- OriginalEsts.append(original_est)
- Qualities.append( (prb,cal_prb,chastity) )
-
- instance_counter += 1
-
- dataset = [SeqInfo, OriginalEsts, Qualities ]
-
- # saving new dataset
- cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)
-
-
-
-
-def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
- """
- Now we want to use interval_query to get the predicted splice scores trained
- on the TAIR sequence and annotation.
- """
-
- size = intervalEnd-intervalBegin
- assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
-
- acc = size*[0.0]
- don = size*[0.0]
-
- interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
- pos_size = new_intp()
- intp_assign(pos_size,1)
-
- # fetch acceptor scores
- sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
- acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
-
- # fetch donor scores
- sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
- don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
-
- return acc, don
-
-
-def process_map(currentRRead,fRead):
- """
- For all matches found by Vmatch we calculate the fragment of the DNA we
- would like to perform an aligment during prediction.
- """
-
- fragment_size = 3000
-
- alternativeSeq = []
-
- onePositiveLabel = False
-
- rId = currentRRead['id']
- pos = currentRRead['pos']
- chr = currentRRead['chr']
- strand = currentRRead['strand']
-
- length = currentRRead['length']
- offset = currentRRead['offset']
-
- CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
-
- # vmatch found the correct position
- if fRead['chr'] == chr and fRead['strand'] == strand and fRead['p_start']-36 <= pos <= fRead['p_stop']+36:
- genomicSeq_start = fRead['p_start'] - CUT_OFFSET
- genomicSeq_stop = fRead['p_stop'] + CUT_OFFSET
- else:
- genomicSeq_start = pos - (fragment_size/2)
- genomicSeq_stop = pos + (fragment_size/2)
-
- return (rId,chr,strand,genomicSeq_start,genomicSeq_stop)
-
-
-def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
- """
- This function expects an interval, chromosome and strand information and
- returns then the genomic sequence of this interval and the associated scores.
- """
-
- assert genomicSeq_start < genomicSeq_stop
-
- chrom = 'chr%d' % chr
- genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
- genomicSeq = genomicSeq.lower()
-
- # check the obtained dna sequence
- assert genomicSeq != '', 'load_genomic returned empty sequence!'
-
- # all entries other than a c g t are set to n
- no_base = re.compile('[^acgt]')
- genomicSeq = no_base.sub('n',genomicSeq)
-
- intervalBegin = genomicSeq_start-100
- intervalEnd = genomicSeq_stop+100
- seq_pos_offset = genomicSeq_start
-
- currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
-
- currentAcc = currentAcc[100:-98]
- currentAcc = currentAcc[1:]
- currentDon = currentDon[100:-100]
-
- length = len(genomicSeq)
- currentAcc = currentAcc[:length]
-
- currentDon = currentDon+[-inf]*(length-len(currentDon))
-
- ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
- gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
-
- assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
- assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
- assert len(currentAcc) == len(currentDon)
-
- return genomicSeq, currentAcc, currentDon
-
-
-def reverse_complement(seq):
- map = {'a':'t','c':'g','g':'c','t':'a'}
-
- new_seq = [map[elem] for elem in seq]
- new_seq.reverse()
- new_seq = "".join(new_seq)
-
- return new_seq
-
-
-def get_seq(begin,end,exon_end):
- """
- """
-
- dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-
- if exon_end:
- gene_start = begin
- gene_stop = end+2
- else:
- gene_start = begin-2
- gene_stop = end
-
- chrom = 'chr%d' % 1
- strand = '+'
-
- genomicSeq = load_genomic(chrom,strand,gene_start,gene_stop,dna_flat_files,one_based=False)
- genomicSeq = genomicSeq.lower()
-
- return genomicSeq
-
-
-def parseLine(line):
- """
- We assume that a line has the following entries:
-
- read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop
-
- """
- #try:
- id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()
- #except:
- # id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
- # true_cut = -1
-
- splitpos = int(splitpos)
- read_size = int(read_size)
-
- seq=seq.lower()
-
- assert strand in ['D','P']
-
- if strand == 'D':
- strand = '+'
-
- if strand == 'P':
- strand = '-'
-
- chr = int(chr)
-
- prb = [ord(elem)-50 for elem in prb]
- cal_prb = [ord(elem)-64 for elem in cal_prb]
- chastity = [ord(elem)+10 for elem in chastity]
-
- p_start = int(p_start)
- exon_stop = int(exon_stop)
- exon_start = int(exon_start)
- p_stop = int(p_stop)
- true_cut = int(true_cut)
-
- line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
- 'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
- 'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
- 'p_stop':p_stop,'true_cut':true_cut}
-
- return line_d
-
-
-if __name__ == '__main__':
- if len(sys.argv) == 1:
- print info
-
- assert len(sys.argv) == 6, help
- compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file)
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import qpalma.Configuration as Conf
-from compile_dataset import DatasetGenerator
-
-filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/allReads.full'
-
-map_1_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/spliced.heuristic'
-map_2_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/map_2nd.vm'
-
-dg = DatasetGenerator(filtered_fn,map_1_fn,map_2_fn)
-#dg.compile_training_set()
-dg.compile_testing_set()
-dg.saveAs('dataset_19_05_08')
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import random
+import os
+import re
+import pdb
+import cPickle
+
+import numpy
+from numpy.matlib import mat,zeros,ones,inf
+
+import qpalma
+import qpalma.tools
+
+from qpalma.parsers import *
+
+from Genefinding import *
+
+from genome_utils import load_genomic
+
+import qpalma.Configuration as Conf
+
+
+class DatasetGenerator:
+    """
+    Compiles training and testing sets from the filtered reads and from the
+    vmatch map files of the first and second mapping runs.
+    """
+
+ def __init__(self,filtered_reads,map_file,map_2nd_file):
+        assert os.path.exists(filtered_reads), 'Error: cannot find reads file'
+ self.filtered_reads = filtered_reads
+
+        assert os.path.exists(map_file), 'Error: cannot find map file'
+        assert os.path.exists(map_2nd_file), 'Error: cannot find map_2nd file'
+ self.map_file = map_file
+ self.map_2nd_file = map_2nd_file
+
+ self.training_set = []
+ self.testing_set = []
+
+ print 'parsing filtered reads..'
+ self.all_filtered_reads = parse_filtered_reads(self.filtered_reads)
+ print 'found %d filtered reads' % len(self.all_filtered_reads)
+
+
+ def saveAs(self,dataset_file):
+        assert not os.path.exists(dataset_file), 'The dataset file already exists!'
+
+ # saving new datasets
+
+ #all_keys = self.training_set.keys()
+ #random.shuffle(all_keys)
+ #training_keys = all_keys[0:10000]
+ #cPickle.dump(self.training_set,open('%s.train.pickle'%dataset_file,'w+'),protocol=2)
+ #cPickle.dump(training_keys,open('%s.train_keys.pickle'%dataset_file,'w+'),protocol=2)
+
+ cPickle.dump(self.testing_set,open('%s.test.pickle'%dataset_file,'w+'),protocol=2)
+
+        prediction_keys = self.testing_set.keys()
+
+ cPickle.dump(prediction_keys,open('%s.test_keys.pickle'%dataset_file,'w+'),protocol=2)
+
+
+ def compile_training_set(self):
+ # this stores the new dataset
+ dataset = {}
+
+        # iterate over all filtered reads and generate a training example
+        # for each one
+ instance_counter = 1
+ skipped_ctr = 0
+
+ for id,filteredRead in self.all_filtered_reads.items():
+ if instance_counter % 1001 == 0:
+ print 'processed %d examples' % instance_counter
+
+ # training set consists only of spliced reads
+            if id >= 1000000300000:
+ continue
+
+ if filteredRead['strand'] != '+':
+ skipped_ctr += 1
+ continue
+
+            if filteredRead['chr'] not in range(1,6):
+ skipped_ctr += 1
+ continue
+
+ # we cut out the genomic region for training
+ CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
+ genomicSeq_start = filteredRead['p_start'] - CUT_OFFSET
+ genomicSeq_stop = filteredRead['p_stop'] + CUT_OFFSET
+
+ # information of the full read we need for training
+ chromo = filteredRead['chr']
+ strand = filteredRead['strand']
+ original_est = filteredRead['seq']
+ quality = filteredRead['prb']
+ cal_prb = filteredRead['cal_prb']
+ chastity = filteredRead['chastity']
+
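+            # the two rows of currentExons are the spliced read's two exonic
+            # segments: row 0 = [p_start, exon_stop] upstream of the intron,
+            # row 1 = [exon_start, p_stop] downstream of it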
+ currentExons = zeros((2,2),dtype=numpy.int)
+ currentExons[0,0] = filteredRead['p_start']
+ currentExons[0,1] = filteredRead['exon_stop']
+ currentExons[1,0] = filteredRead['exon_start']
+ currentExons[1,1] = filteredRead['p_stop']
+
+ # add instance to set
+ currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
+ currentQualities = (quality,cal_prb,chastity)
+
+ dataset[id] = (currentSeqInfo,currentExons,original_est,currentQualities)
+
+ instance_counter += 1
+
+ print 'Full dataset has size %d' % len(dataset)
+ print 'Skipped %d reads' % skipped_ctr
+
+ self.training_set = dataset
+
+
+ def parse_map_file(self,dataset,map_file):
+ strand_map = ['-','+']
+ instance_counter = 1
+
+ for line in open(map_file):
+ if instance_counter % 1001 == 0:
+ print 'processed %d examples' % instance_counter
+
+ line = line.strip()
+ slist = line.split()
+ id = int(slist[0])
+ chromo = int(slist[1])
+ pos = int(slist[2])
+ strand = slist[3]
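+            # boolean indexing: (strand == 'D') evaluates to 0 or 1, so 'D'
+            # maps to '+' and anything else ('P') to '-'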
+ strand = strand_map[strand == 'D']
+
+ if strand != '+':
+ continue
+
+            if chromo not in range(1,6):
+ continue
+
+ genomicSeq_start = pos - 1500
+ genomicSeq_stop = pos + 1500
+
+ # fetch missing information from original reads
+ filteredRead = self.all_filtered_reads[id]
+ original_est = filteredRead['seq']
+ original_est = original_est.lower()
+
+ prb = filteredRead['prb']
+ cal_prb = filteredRead['cal_prb']
+ chastity = filteredRead['chastity']
+
+ # add instance to set
+ currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
+ currentQualities = (prb,cal_prb,chastity)
+
+ # as one single read can have several vmatch matches we store all
+ # these matches under the unique id of the read
+            if id in dataset:
+ old_entry = dataset[id]
+ old_entry.append((currentSeqInfo,original_est,currentQualities))
+ dataset[id] = old_entry
+ else:
+ dataset[id] = [(currentSeqInfo,original_est,currentQualities)]
+
+ instance_counter += 1
+
+ print 'Total examples processed: %d' % instance_counter
+
+ return dataset
+
+
+ def compile_testing_set(self):
+
+ dataset = {}
+
+ # usually we have two files to parse:
+ # the map file from the second run and a subset of the map file from the
+ # first run
+ dataset = self.parse_map_file(dataset,self.map_file)
+ dataset = self.parse_map_file(dataset,self.map_2nd_file)
+
+ # store the full set
+ self.testing_set = dataset
+
+
+def compile_dataset_direct(filtered_reads,dataset_file):
+
+ strand_map = ['-','+']
+
+ SeqInfo = []
+ OriginalEsts = []
+ Qualities = []
+
+ instance_counter = 0
+
+ for line in open(filtered_reads):
+ line = line.strip()
+ slist = line.split()
+ id = int(slist[0])
+
+        if id >= 1000000300000:
+ continue
+
+ if instance_counter % 1000 == 0:
+ print 'processed %d examples' % instance_counter
+
+ chr = int(slist[1])
+ strand = slist[2]
+ strand = strand_map[strand == 'D']
+
+ genomicSeq_start = int(slist[10]) - 1000
+        genomicSeq_stop = int(slist[13]) + 1000
+
+ original_est = slist[3]
+ original_est = original_est.lower()
+ #print original_est
+
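+        # the quality strings are ASCII-encoded with fixed offsets (the same
+        # scheme parseLine below uses): e.g. for prb, 'A' (ord 65) decodes
+        # to 65-50 = 15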
+ prb = [ord(elem)-50 for elem in slist[6]]
+ cal_prb = [ord(elem)-64 for elem in slist[7]]
+ chastity = [ord(elem)+10 for elem in slist[8]]
+
+ #pdb.set_trace()
+
+ # add instance to set
+ SeqInfo.append((id,chr,strand,genomicSeq_start,genomicSeq_stop))
+ OriginalEsts.append(original_est)
+ Qualities.append( (prb,cal_prb,chastity) )
+
+ instance_counter += 1
+
+ dataset = [SeqInfo, OriginalEsts, Qualities ]
+
+ # saving new dataset
+ cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)
+
+
+
+
+def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
+    """
+    Use interval_query to fetch the predicted splice-site scores trained on
+    the TAIR sequence and annotation.
+    """
+
+ size = intervalEnd-intervalBegin
+ assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
+
+ acc = size*[0.0]
+ don = size*[0.0]
+
+ interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
+ pos_size = new_intp()
+ intp_assign(pos_size,1)
+
+ # fetch acceptor scores
+ sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
+ acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
+
+ # fetch donor scores
+ sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
+ don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
+
+ return acc, don
+
+
+def process_map(currentRRead,fRead):
+    """
+    For each match found by vmatch, compute the DNA fragment on which the
+    alignment will be performed during prediction.
+    """
+
+ fragment_size = 3000
+
+ alternativeSeq = []
+
+ onePositiveLabel = False
+
+ rId = currentRRead['id']
+ pos = currentRRead['pos']
+ chr = currentRRead['chr']
+ strand = currentRRead['strand']
+
+ length = currentRRead['length']
+ offset = currentRRead['offset']
+
+ CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
+
+ # vmatch found the correct position
+ if fRead['chr'] == chr and fRead['strand'] == strand and fRead['p_start']-36 <= pos <= fRead['p_stop']+36:
+ genomicSeq_start = fRead['p_start'] - CUT_OFFSET
+ genomicSeq_stop = fRead['p_stop'] + CUT_OFFSET
+ else:
+ genomicSeq_start = pos - (fragment_size/2)
+ genomicSeq_stop = pos + (fragment_size/2)
+
+ return (rId,chr,strand,genomicSeq_start,genomicSeq_stop)
+
+
+def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
+    """
+    Given a chromosome, strand and interval, return the genomic sequence of
+    that interval together with the associated splice-site scores.
+    """
+
+ assert genomicSeq_start < genomicSeq_stop
+
+ chrom = 'chr%d' % chr
+ genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
+ genomicSeq = genomicSeq.lower()
+
+ # check the obtained dna sequence
+ assert genomicSeq != '', 'load_genomic returned empty sequence!'
+
+ # all entries other than a c g t are set to n
+ no_base = re.compile('[^acgt]')
+ genomicSeq = no_base.sub('n',genomicSeq)
+
+ intervalBegin = genomicSeq_start-100
+ intervalEnd = genomicSeq_stop+100
+ seq_pos_offset = genomicSeq_start
+
+ currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
+
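+    # trim the 100-position flanks that were added to the query interval and
+    # shift the acceptor track so that index p in both score lists lines up
+    # with index p in genomicSeq; short donor tracks are padded with -inf.
+    # The asserts below verify this alignment against the literal 'ag' and
+    # 'gt'/'gc' dinucleotide positions (pdb.set_trace() fires on a mismatch).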
+ currentAcc = currentAcc[100:-98]
+ currentAcc = currentAcc[1:]
+ currentDon = currentDon[100:-100]
+
+ length = len(genomicSeq)
+ currentAcc = currentAcc[:length]
+
+ currentDon = currentDon+[-inf]*(length-len(currentDon))
+
+ ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
+ gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
+
+ assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
+ assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
+ assert len(currentAcc) == len(currentDon)
+
+ return genomicSeq, currentAcc, currentDon
+
+
+def reverse_complement(seq):
+    # complement table; renamed from `map` to avoid shadowing the builtin
+    complement = {'a':'t','c':'g','g':'c','t':'a'}
+
+    new_seq = [complement[elem] for elem in seq]
+ new_seq.reverse()
+ new_seq = "".join(new_seq)
+
+ return new_seq
+
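+# quick sanity check for reverse_complement (lowercase acgt only; any other
+# character, e.g. 'n', raises a KeyError since it has no complement entry):
+#
+#   reverse_complement('gattaca') == 'tgtaatc'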
+
+def get_seq(begin,end,exon_end):
+    """
+    Load a genomic fragment (chromosome 1, '+' strand), extended by two bases
+    so that the splice-site dinucleotide at the given exon boundary is
+    included.
+    """
+
+ dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+
+ if exon_end:
+ gene_start = begin
+ gene_stop = end+2
+ else:
+ gene_start = begin-2
+ gene_stop = end
+
+    chrom = 'chr1'
+ strand = '+'
+
+ genomicSeq = load_genomic(chrom,strand,gene_start,gene_stop,dna_flat_files,one_based=False)
+ genomicSeq = genomicSeq.lower()
+
+ return genomicSeq
+
+
+def parseLine(line):
+ """
+ We assume that a line has the following entries:
+
+    read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut
+
+ """
+ #try:
+ id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()
+ #except:
+ # id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
+ # true_cut = -1
+
+ splitpos = int(splitpos)
+ read_size = int(read_size)
+
+    seq = seq.lower()
+
+    assert strand in ['D','P']
+
+    if strand == 'D':
+        strand = '+'
+    elif strand == 'P':
+        strand = '-'
+
+ chr = int(chr)
+
+ prb = [ord(elem)-50 for elem in prb]
+ cal_prb = [ord(elem)-64 for elem in cal_prb]
+ chastity = [ord(elem)+10 for elem in chastity]
+
+ p_start = int(p_start)
+ exon_stop = int(exon_stop)
+ exon_start = int(exon_start)
+ p_stop = int(p_stop)
+ true_cut = int(true_cut)
+
+ line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
+ 'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
+ 'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
+ 'p_stop':p_stop,'true_cut':true_cut}
+
+ return line_d
+
+
+if __name__ == '__main__':
+    usage = 'Usage: compile_dataset.py gff_file dna_flat_files solexa_reads remapped_reads dataset_file'
+
+    assert len(sys.argv) == 6, usage
+
+    gff_file, dna_flat_files, solexa_reads, remapped_reads, dataset_file = sys.argv[1:]
+    # compile_d is expected to be provided elsewhere; it is not defined in
+    # this module
+    compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file)
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import qpalma.Configuration as Conf
+from compile_dataset import DatasetGenerator
+
+filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/allReads.full'
+
+map_1_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/spliced.heuristic'
+map_2_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/map_2nd.vm'
+
+dg = DatasetGenerator(filtered_fn,map_1_fn,map_2_fn)
+#dg.compile_training_set()
+dg.compile_testing_set()
+dg.saveAs('dataset_19_05_08')
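+
+# saveAs('dataset_19_05_08') writes two pickles into the current working
+# directory: dataset_19_05_08.test.pickle (the testing set) and
+# dataset_19_05_08.test_keys.pickle (its prediction keys)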