// printf("%f ",pidx,p.penalties[pidx]);
// printf("\n");
//}
+ //
+
+ //int h_idx;
+ //for(h_idx=0;h_idx<h.len;h_idx++)
+ // printf("%d %f %f\n",h_idx,h.limits[h_idx],h.penalties[h_idx]);
//printf("calling fill_matrix...\n");
fill_matrix(nr_paths, matrices, est_len, dna_len, est, dna, prb, &h, matchmatrix, qualityScores, donor, acceptor, remove_duplicate_scores, nr_donor_sites, donor_sites, max_score_positions,currentMode);
mmatrix_param_size = (mlen*mlen)*nr_paths;
mmatrix_size = mlen*mlen;
}
-
+
alignmentscores_size = nr_paths; //alignment score for each path/matrix
numPathsPlifs = numPlifs*nr_paths; //alignment score for each path/matrix
#
C = 1
-min_intron_len = 10
-max_intron_len = 1000
+min_intron_len = 20
+max_intron_len = 2000
min_svm_score = 0.0
max_svm_score = 1.0
# instead.
###############################################################################
-numDonSuppPoints = 30
-numAccSuppPoints = 30
-numLengthSuppPoints = 30
-numQualSuppPoints = 16
+#numLengthSuppPoints = 30
+#numDonSuppPoints = 30
+#numAccSuppPoints = 30
+#numQualSuppPoints = 16
+
+numLengthSuppPoints = 20
+numDonSuppPoints = 20
+numAccSuppPoints = 20
+numQualSuppPoints = 4
min_qual = -1
max_qual = 40
###############################################################################
training_begin = 0
-training_end = 1500
+training_end = 2000
-prediction_begin = 0
-prediction_end = 1500
+prediction_begin = 2000
+prediction_end = 3500
joinp = os.path.join
annot_path = joinp(data_path,'annotation_data')
remapped_path = joinp(data_path,'remapped_solexa_data')
-dna_flat_fn = joinp(data_path,'allGenes.pickle')
-gff_fn = joinp(annot_path,'TAIR7_GFF3_genes_Chr1.gff_v1')
-filtered_fn = joinp(data_path,'filteredReads_1_recent')
+dna_flat_fn = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+gff_fn = joinp(annot_path,'TAIR7_GFF3_genes_Chr%s.gff_v1')
+filtered_fn = joinp(data_path,'allFilteredReads_04_02_2008')
remapped_fn = joinp(remapped_path,'map_best_hit.18.unambig')
dataset_fn = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/chr1_dataset.pickle'
assert numQualSuppPoints > 1
assert os.path.exists(dna_flat_fn), 'DNA data does not exist!'
-assert os.path.exists(gff_fn), 'EST/Reads data does not exist!'
assert os.path.exists(filtered_fn), 'EST/Reads data does not exist!'
assert os.path.exists(remapped_fn), 'EST/Reads data does not exist!'
totalQualSP = Configuration.totalQualSuppPoints
numQPlifs = Configuration.numQualPlifs
- regC = self.numFeatures / 1.0*self.numExamples
+
+ #lengthGroupParam = 5*1e-9
+ #spliceGroupParam = 1e-8
+
+ lengthGroupParam = 5*1e-9
+ spliceGroupParam = 1e-9
+
+ self.P[0:lengthSP,0:lengthSP] *= lengthGroupParam
+
+ beg = lengthSP
+ end = lengthSP+donSP+accSP
+ self.P[beg:end,beg:end] *= spliceGroupParam
+
+ regC = self.numFeatures / (1.0*self.numExamples)
+
+ #cfactor = 1e-8
+ cfactor = 5e-9
for j in range(lengthSP-1):
- self.P[j,j+1] = -1.0
- self.P[j+1,j] = -1.0
- self.P[j,j] += 1.0
+ self.P[j,j+1] = -cfactor
+ self.P[j+1,j] = -cfactor
+ self.P[j,j] += cfactor
+ self.P[j+1,j+1] += cfactor
+
+ #cfactor = 1e-8
+ cfactor = 5e-9
for j in range(lengthSP,lengthSP+donSP-1):
- self.P[j,j+1] = -1.0
- self.P[j+1,j] = -1.0
- self.P[j,j] += 1.0
+ cfactor = spliceGroupParam
+ self.P[j,j+1] = -cfactor
+ self.P[j+1,j] = -cfactor
+ self.P[j,j] += cfactor
+ self.P[j+1,j+1] += cfactor
for j in range(lengthSP+donSP,lengthSP+donSP+accSP-1):
- self.P[j,j+1] = -1.0
- self.P[j+1,j] = -1.0
- self.P[j,j] += 1.0
+ cfactor = spliceGroupParam
+ self.P[j,j+1] = -cfactor
+ self.P[j+1,j] = -cfactor
+ self.P[j,j] += cfactor
+ self.P[j+1,j+1] += cfactor
offset = lengthSP+donSP+accSP+mmatrixSP
for k in range(numQPlifs):
# 0.25 for each was already good
- lengthGroupParam = 0.005
- spliceGroupParam = 0.005
+ matchGroupParam = 1.0
+ qualityGroupParam = 1.0
- matchGroupParam = 0.495
- qualityGroupParam = 0.495
+ # lengthGroupParam = 0.005
+ # spliceGroupParam = 0.005
+ # matchGroupParam = 0.495
+ # qualityGroupParam = 0.495
- self.P[0:lengthSP,0:lengthSP] *= lengthGroupParam
+ #lengthGroupParam = lengthSP / (1.0*lengthSP+donSP+accSP)
+ #spliceGroupParam = donSP+accSP / (1.0*lengthSP+donSP+accSP)
+ #matchGroupParam = mmatrixSP/(1.0*mmatrixSP+totalQualSP)
+ #qualityGroupParam = totalQualSP/(1.0*mmatrixSP+totalQualSP)
+
+
+ #denom = (1.0*lengthSP+donSP+accSP+mmatrixSP+totalQualSP)
+ #ratio1 = lengthSP+donSP+accSP / denom
+ #ratio2 = mmatrixSP+totalQualSP / denom
+
+ #lengthGroupParam *= ratio1
+ #spliceGroupParam *= ratio1
+
+ #matchGroupParam *= ratio2
+ #qualityGroupParam *= ratio2
- beg = lengthSP
- end = lengthSP+donSP+accSP
- self.P[beg:end,beg:end] *= spliceGroupParam
beg = lengthSP+donSP+accSP
end = lengthSP+donSP+accSP+mmatrixSP
beg = lengthSP+donSP+accSP+mmatrixSP
self.P[beg:-self.numExamples,beg:-self.numExamples] *= qualityGroupParam
- self.P[0:-self.numExamples,0:-self.numExamples] *= regC*1e-3
+ self.P[0:-self.numExamples,0:-self.numExamples] *= regC
def createRegularizer(self):
# y_i and y_i+1 parameters, so the fractions are saved in the weight vectors!
def calculateWeights(plf, scores):
- currentWeight = zeros((30,1))
+ currentWeight = zeros((len(plf.limits),1))
for k in range(len(scores)):
value = scores[k]
for pos in range(len(intron_starts)):
values[pos] = intron_ends[pos] - intron_starts[pos] + 1
+ #print 'introns found: ' + str(values)
weightIntron = calculateWeights(h,values)
return weightDon, weightAcc, weightIntron
import numpy.matlib
import QPalmaDP
import pdb
-import Configuration
+import Configuration as Conf
from Plif import *
def set_param_palma(param, train_with_intronlengthinformation,\
h = Plf()
d = Plf()
a = Plf()
- qualityPlifs = [None]*Configuration.numQualPlifs
+ qualityPlifs = [None]*Conf.numQualPlifs
- donSP = Configuration.numDonSuppPoints
- accSP = Configuration.numAccSuppPoints
- lengthSP = Configuration.numLengthSuppPoints
- mmatrixSP = Configuration.sizeMatchmatrix[0]\
- *Configuration.sizeMatchmatrix[1]
- totalQualSP = Configuration.totalQualSuppPoints
+ donSP = Conf.numDonSuppPoints
+ accSP = Conf.numAccSuppPoints
+ lengthSP = Conf.numLengthSuppPoints
+ mmatrixSP = Conf.sizeMatchmatrix[0]\
+ *Conf.sizeMatchmatrix[1]
+ totalQualSP = Conf.totalQualSuppPoints
####################
# Gapfunktion
# Matchmatrix
####################
mmatrix = numpy.matlib.mat(param[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP])
- mmatrix.reshape(Configuration.sizeMatchmatrix[0],Configuration.sizeMatchmatrix[1])
+ mmatrix.reshape(Conf.sizeMatchmatrix[0],Conf.sizeMatchmatrix[1])
####################
# Quality Plifs
####################
- for idx in range(Configuration.numQualPlifs):
+ for idx in range(Conf.numQualPlifs):
currentPlif = Plf()
- currentPlif.limits = linspace(Configuration.min_qual,Configuration.max_qual,Configuration.numQualSuppPoints)
- begin = lengthSP+donSP+accSP+mmatrixSP+(idx*Configuration.numQualSuppPoints)
- end = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Configuration.numQualSuppPoints)
+ currentPlif.limits = linspace(Conf.min_qual,Conf.max_qual,Conf.numQualSuppPoints)
+ begin = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
+ end = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
currentPlif.penalties = param[begin:end].flatten().tolist()[0]
currentPlif.transform = ''
currentPlif.name = 'q'
- currentPlif.max_len = Configuration.max_qual
- currentPlif.min_len = Configuration.min_qual
+ currentPlif.max_len = Conf.max_qual
+ currentPlif.min_len = Conf.min_qual
qualityPlifs[idx] = currentPlif
return h,d,a,mmatrix,qualityPlifs
- 'remappped_pos' - the position of the remapped read
"""
-def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file):
-
- assert os.path.exists(gff_file)
- #for file in dna_flat_files:
- # assert os.path.exists(file)
+def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
assert os.path.exists(filtered_reads)
assert os.path.exists(remapped_reads)
joinp = os.path.join
# first read the gff file(s) and create pickled gene information
- allGenes = parse_gff(gff_file) #,joinp(tmp_dir,'gff_info.pickle'))
-
- print 'parsing filtered reads'
- frp = FilteredReadParser(filtered_reads)
+ allGenes = [None]*6
- print 'parsing remapped reads'
+ for i in range(1,6):
+ allGenes[i]= parse_gff(gff_file%i) #,joinp(tmp_dir,'gff_info.pickle'))
+
+ print 'parsing filtered reads..'
+ frp = FilteredReadParser(filtered_reads)
all_filtered_reads = frp.parse()
+ print 'found %d filtered reads' % len(all_filtered_reads)
+ print 'parsing remapped reads'
rrp = RemappedReadParser(remapped_reads)
all_remapped_reads = rrp.parse()
+ if all_remapped_reads != None:
+ size_rem = len(all_remapped_reads)
+ else:
+ size_rem = 0
+
+ print 'found %d remapped reads' % size_rem
# this stores the new dataset
Sequences = []
Qualities = []
SplitPositions = []
Ids = []
+
+ #pdb.set_trace()
# Iterate over all remapped reads in order to generate for each read an
# training / prediction example
currentRReads = None
gene_id = currentFRead['gene_id']
- currentGene = allGenes[gene_id]
+ chr = currentFRead['chr']
+ currentGene = allGenes[chr][gene_id]
+
+ if currentFRead['strand'] != '+':
+ continue
- #seq, acc, don, exons, est, qual, spos = process_read(currentRReads,currentFRead,currentGene,genome_file,sscore_filename)
- seq, acc, don, exons, est, qual, spos = process_read(currentRReads,currentFRead,currentGene)
+ seq, acc, don, exons, est, qual, spos = process_read(currentRReads,currentFRead,currentGene,dna_flat_files,test)
if seq == '':
continue
io_pickle.save(dataset_file,dataset)
-#def process_read(reReads,fRead,currentGene,genome_file,sscore_filename):
-def process_read(reReads,fRead,currentGene):
+def process_read(reReads,fRead,currentGene,dna_flat_files,test):
"""
The purpose of this function is to create an example for QPalma
by using a
"""
+ nil = ('','','','','','','')
+
chr = fRead['chr']
strand = fRead['strand']
chrom = 'chr%d' % chr
- genome_file = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s' % (chr,strand)
# use matlab-style functions to access dna sequence
- currentSeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,genome_file,one_based=True)
+ currentSeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=True)
currentSeq = currentSeq.lower()
- if strand == '-':
- currentSeq = reverse_complement(currentSeq)
-
- nil = ('','','','','','','')
+ assert currentSeq != '', 'load_genomic returned empty sequence!'
- if len(currentSeq) < (currentGene.stop - currentGene.start):
- return nil
+ if strand == '-':
+ try:
+ currentSeq = reverse_complement(currentSeq)
+ except:
+ print 'invalid char'
+ return nil
p_start = fRead['p_start']
exon_stop = fRead['exon_stop']
quality = fRead['prb']
spos = fRead['splitpos']
- #if fRead['chr'] != 1 and fRead['strand'] != 'D':
- # return nil
-
assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
assert p_stop - p_start >= 36
# if we perform testing we cut a much wider region as we want to detect
# how good we perform on a region
- test = False
-
if test:
up_cut = up_cut-500
if up_cut < 0:
try:
if not (currentSeq[int(currentExons[0,1])] == 'g' and\
currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
+ print 'not aligned'
return nil
if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
+ print 'not aligned'
return nil
except:
# now we want to use interval_query to get the predicted splice scores
# trained on the TAIR sequence and annotation
-
interval_matrix = createIntArrayFromList([currentGene.start+up_cut-100,currentGene.start+down_cut+100])
num_fields = 1
num_intervals = 1
- sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_1+'
+ sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
+ % (chr,strand)
ag_tuple_pos = [p for p,e in enumerate(currentSeq) if p>0 and p<len(currentSeq)-1 and currentSeq[p-1]=='a' and e=='g' ]
del gf
# fetch donor scores
- sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+'
+ sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s'\
+ % (chr,strand)
+
gf = Genef()
num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
assert num_hits <= len(currentSeq)
don = [-inf] + don[:-1]
- #for p,e in enumerate(acc):
- # if p not in ag_tuple_pos:
- # acc[p] = -inf
-
- #for p,e in enumerate(don):
- # if p not in gt_tuple_pos:
- # don[p] = -inf
-
assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
return new_seq
-
if __name__ == '__main__':
if len(sys.argv) == 1:
print info
self.ARGS = Param()
#from compile_dataset import compile_d
- #compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn)
+ #compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,False)
+
+ from compile_dataset import compile_d
+ compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn,True)
#gen_file= '%s/genome.config' % self.ARGS.basedir
#ginfo_filename = 'genome_info.pickle'
print 'Donor max/min: %f / %f' % (max_score,min_score)
+ min_score = 10000
+ max_score = 0
+ mean_intron_len = 0
+
+ for elem in self.Exons:
+ intron_len = int(elem[1,0] - elem[0,1])
+ mean_intron_len += intron_len
+ min_score = min(min_score,intron_len)
+ max_score = max(max_score,intron_len)
+
+ mean_intron_len /= 1.0*len(self.Exons)
+ print 'Intron length max/min: %d / %d mean length %f' % (max_score,min_score,mean_intron_len)
+
+
self.use_quality_scores = True
else:
assert(False)
# Set the parameters such as limits penalties for the Plifs
[h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
+ pdb.set_trace()
+
# Initialize solver
if Conf.USE_OPT:
self.plog('Initializing problem...\n')
# Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
trueSpliceAlign, trueWeightMatch, trueWeightQuality = computeSpliceAlignWithQuality(dna, exons)
- #print 'trueWeights'
# Calculate the weights
trueWeightDon, trueWeightAcc, trueWeightIntron = computeSpliceWeights(d, a, h, trueSpliceAlign, don_supp, acc_supp)
trueWeight = numpy.vstack([trueWeightIntron, trueWeightDon, trueWeightAcc, trueWeightMatch, trueWeightQuality])
+ #idx_lst = [p for p,e in enumerate(trueWeightIntron) if e > 1e-5]
+ #print idx_lst
+ #print [h.limits[p] for p in idx_lst]
+
currentPhi[0:donSP] = mat(d.penalties[:]).reshape(donSP,1)
currentPhi[donSP:donSP+accSP] = mat(a.penalties[:]).reshape(accSP,1)
currentPhi[donSP+accSP:donSP+accSP+lengthSP] = mat(h.penalties[:]).reshape(lengthSP,1)
currentPhi[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP] = mmatrix[:]
- #pdb.set_trace()
if Conf.mode == 'using_quality_scores':
totalQualityPenalties = param[-totalQualSP:]
currentPhi[donSP+accSP+lengthSP+mmatrixSP:] = totalQualityPenalties[:]
for i in range(num_path[exampleIdx]):
newDPScores[i] = c_DPScores[i]
-
if self.use_quality_scores:
for i in range(Conf.totalQualSuppPoints*num_path[exampleIdx]):
newQualityPlifsFeatures[i] = c_qualityPlifsFeatures[i]
wp = numpy.vstack([weightIntron, weightDon, weightAcc, newWeightMatch[pathNr,:].T, decodedQualityFeatures])
allWeights[:,pathNr+1] = wp
+ #pdb.set_trace()
+
hpen = mat(h.penalties).reshape(len(h.penalties),1)
dpen = mat(d.penalties).reshape(len(d.penalties),1)
apen = mat(a.penalties).reshape(len(a.penalties),1)
if math.fabs(AlignmentScores[pathNr] - AlignmentScores[pathNr+1]) > 1e-5:
distinct_scores = True
- #if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) + [0,1][distinct_scores and (pathNr>0)] <= 1e-5:
if not math.fabs(newDPScores[pathNr,0] - AlignmentScores[pathNr+1]) <= 1e-5:
pdb.set_trace()
- # # if the pathNr-best alignment is very close to the true alignment consider it as true
+ # if the pathNr-best alignment is very close to the true alignment consider it as true
if norm( allWeights[:,0] - allWeights[:,pathNr+1] ) < 1e-5:
true_map[pathNr+1] = 1
if len([elem for elem in true_map if elem == 1]) == len(true_map):
num_path[exampleIdx] = num_path[exampleIdx]+1
- # Choose true and first false alignment for extending A
+ # Choose true and first false alignment for extending
firstFalseIdx = -1
for map_idx,elem in enumerate(true_map):
if elem == 0:
def predict(self,param_filename,beg,end):
self.logfh = open('_qpalma_predict.log','w+')
- beg = Conf.prediction_begin
- end = Conf.prediction_end
-
Sequences = self.Sequences[beg:end]
Exons = self.Exons[beg:end]
Ests = self.Ests[beg:end]
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import pdb
+
+import qpalma
+from qpalma.tools.parseGff import parse_gff
+
+def filter(gff_file,reads,out):
+ allGenes = parse_gff(gff_file)
+
+ out_fh = open(out,'w+')
+
+
+ out_fh.close()
+
+if __name__ == '__main__':
+ assert len(sys.argv) == 4, 'Usage: '
+ (gff_file,reads,out) = sys.argv[1:4]
+ filter(gff_file,reads,out)
+