numDonSuppPoints = 30
numAccSuppPoints = 30
numLengthSuppPoints = 30
-numQualSuppPoints = 16
+numQualSuppPoints = 4
min_qual = -1
max_qual = 40
import scipy.io
import pdb
-
-def paths_load_data_solexa(expt,genome_info,PAR):
+def paths_load_data_solexa(expt,genome_info,PAR,test=False):
# expt can be 'training','validation' or 'test'
assert expt in ['training','validation','test']
for line in open(est_filename):
line = line.strip()
- chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,exon_idx = line.split()
+ chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
splitpos = int(splitpos)
length = int(length)
prb = [ord(elem)-50 for elem in prb]
currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
except:
continue
+
+ if len(currentSeq) < (currentGene.stop - currentGene.start):
+ continue
- #assert currentSize == len(Sequences[-1]), 'gene_size %d / seq size %d' % (currentSize,len(Sequences[-1]))
- #Acceptors.append([-inf]*currentSize)
- #Donors.append([-inf]*currentSize)
+ oldSeq = currentSeq
- exon_idx = int(exon_idx)
+ p_start = int(p_start)
+ exon_stop = int(exon_stop)
+ exon_start = int(exon_start)
+ p_stop = int(p_stop)
+
+ assert p_start <= exon_stop <= exon_start <= p_stop, 'Invalid Indices'
+ assert p_stop - p_start >= 36
- assert exon_idx > 0
- #print "%d " % exon_idx,
currentExons = zeros((2,2))
- #for idx in range(len(currentGene.exons)):
- # currentExons[idx,0] = currentGene.exons[idx][0]-currentGene.start
- # currentExons[idx,1] = currentGene.exons[idx][1]-currentGene.start
- try:
- currentExons[0,0] = currentGene.exons[exon_idx-1][0]-currentGene.start
- currentExons[0,1] = currentGene.exons[exon_idx-1][1]-currentGene.start
+ currentExons[0,0] = p_start-currentGene.start
+ currentExons[0,1] = exon_stop-currentGene.start
- currentExons[1,0] = currentGene.exons[exon_idx][0]-currentGene.start
- currentExons[1,1] = currentGene.exons[exon_idx][1]-currentGene.start
+ currentExons[1,0] = exon_start-currentGene.start
+ currentExons[1,1] = p_stop-currentGene.start
- cut_offset = 500
+ import copy
+ oldExons = copy.deepcopy(currentExons)
+
+ up_cut = int(currentExons[0,0])
+ down_cut = int(currentExons[1,1])
- up_cut = int(currentExons[0,0]) - cut_offset
- down_cut = int(currentExons[1,1]) + cut_offset
+ if test:
+ up_cut = up_cut-500
if up_cut < 0:
up_cut = 0
+ down_cut = down_cut+500
+
if down_cut > len(currentSeq):
down_cut = len(currentSeq)
- assert down_cut - up_cut > 0
- assert down_cut - up_cut <= len(currentSeq)
+ currentSeq = currentSeq[up_cut:down_cut]
- currentSeq = currentSeq[up_cut:down_cut]
-
- #pdb.set_trace()
+ currentExons[0,1] += 1
+ #currentExons[1,0] += 1
- if up_cut > 0:
- currentExons[0,0] -= up_cut
- currentExons[0,1] -= up_cut
+ if test:
+ currentExons -= up_cut
+ else:
+ currentExons -= currentExons[0,0]
- currentExons[1,0] -= up_cut
- currentExons[1,1] -= up_cut
-
- except:
- continue
- #pdb.set_trace()
+ try:
+ if not (currentSeq[int(currentExons[0,1])] == 'g' and currentSeq[int(currentExons[0,1])+1] == 't'):
+ continue
- #pdb.set_trace()
+ if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
+ continue
- currentExons[0,1] += 1
- #currentExons[1,0] += 1
+ except:
+ pdb.set_trace()
Sequences.append(currentSeq)
SplitPositions.append(splitpos)
- if len(Sequences[-1]) == 2755:
- Sequences = Sequences[:-1]
- Acceptors = Acceptors[:-1]
- Donors = Donors[:-1]
- Exons = Exons[:-1]
- Ests = Ests[:-1]
- Qualities = Qualities[:-1]
- SplitPositions = SplitPositions[:-1]
+ #if len(Sequences[-1]) == 2755:
+ # Sequences = Sequences[:-1]
+ # Acceptors = Acceptors[:-1]
+ # Donors = Donors[:-1]
+ # Exons = Exons[:-1]
+ # Ests = Ests[:-1]
+ # Qualities = Qualities[:-1]
+ # SplitPositions = SplitPositions[:-1]
print 'found %d examples' % len(Sequences)
return currentWeight
-def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp,dec=False):
+def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp):
####################################################################################
# 1. Donor: In don_supp stehen Werte der SVM., in SpliceAlign die Einsen
####################################################################################
acc_score = [acc_score[0]] + acc_score
acc_score = acc_score[:-1]
Acceptors.append(acc_score)
- print str((max(don_score),min(numpy.where(numpy.isinf(don_score),100,don_score))))
- print str((max(acc_score),min(numpy.where(numpy.isinf(acc_score),100,acc_score))))
+ #print str((max(don_score),min(numpy.where(numpy.isinf(don_score),100,don_score))))
+ #print str((max(acc_score),min(numpy.where(numpy.isinf(acc_score),100,acc_score))))
#pdb.set_trace()
from qpalma.Plif import Plf
from qpalma.Helpers import *
+from qpalma.tools.splicesites import getDonAccScores
+
def getQualityFeatureCounts(qualityPlifs):
weightQuality = qualityPlifs[0].penalties
for currentPlif in qualityPlifs[1:]:
use_quality_scores = False
elif Conf.mode == 'using_quality_scores':
- #Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
- #end = 50
- #Sequences = Sequences[:end]
- #Acceptors = Acceptors[:end]
- #Donors = Donors[:end]
- #Exons = Exons[:end]
- #Ests = Ests[:end]
- #Qualities = Qualities[:end]
- #SplitPos = SplitPos[:end]
+ Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos =\
+ paths_load_data_solexa('training',None,self.ARGS,True)
+
+ begin = 200
+ end = 400
+ Sequences = Sequences[begin:end]
+ Exons = Exons[begin:end]
+ Ests = Ests[begin:end]
+ Qualities = Qualities[begin:end]
+ Acceptors = Acceptors[begin:end]
+ Donors = Donors[begin:end]
+
+ pdb.set_trace()
+
+ Donors, Acceptors = getDonAccScores(Sequences)
+
+
+ #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
+ #SplitPos = [1]*len(Qualities)
- Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
- SplitPos = [1]*len(Qualities)
#Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
#pdb.set_trace()
#Qualities = []
# Initialize parameter vector / param = numpy.matlib.rand(126,1)
#param_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/python/elegans.param'
- param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_30.pickle'
+ param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_200.pickle'
param = load_param(param_filename)
# Set the parameters such as limits penalties for the Plifs
else:
oldElem = SpliceAlign[pos-1]
- if oldElem == 2 and elem == 0: # start of exon
+ if oldElem != 0 and elem == 0: # start of exon
newExons.append(pos-1)
- if oldElem == 0 and elem == 1: # end of exon
+ if oldElem == 0 and elem != 0: # end of exon
newExons.append(pos)
- pdb.set_trace()
- #up_off = -1
- #down_off = -1
+ up_off = -1
+ down_off = -1
- #if len(newExons) != 4:
- # acc = 0.0
- #else:
- # e1_begin,e1_end = newExons[0],newExons[1]
- # e2_begin,e2_end = newExons[2],newExons[3]
+ if len(newExons) != 4:
+ acc = 0.0
+ else:
+ e1_begin,e1_end = newExons[0],newExons[1]
+ e2_begin,e2_end = newExons[2],newExons[3]
- # up_off = int(math.fabs(e1_end - exons[0,1]))
- # down_off = int(math.fabs(e2_begin - exons[1,0]))
+ up_off = int(math.fabs(e1_end - exons[0,1]))
+ down_off = int(math.fabs(e2_begin - exons[1,0]))
+
+ pdb.set_trace()
- #return up_off,down_off
+ return up_off,down_off
if __name__ == '__main__':
qpalma = QPalma()
use_quality_scores = False
elif Configuration.mode == 'using_quality_scores':
- #Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
- #end = 50
- #Sequences = Sequences[:end]
- #Exons = Exons[:end]
- #Ests = Ests[:end]
- #Qualities = Qualities[:end]
- #SplitPos = SplitPos[:end]
-
- #Donors, Acceptors = getDonAccScores(Sequences)
-
- Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
+ filename = 'real_dset_%s'% 'recent'
+ if True:# not os.path.exists(filename):
+ Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
+
+ end = 200
+ Sequences = Sequences[:end]
+ Exons = Exons[:end]
+ Ests = Ests[:end]
+ Qualities = Qualities[:end]
+ Acceptors = Acceptors[:end]
+ Donors = Donors[:end]
+
+ pdb.set_trace()
+
+ Donors, Acceptors = getDonAccScores(Sequences)
+ #dset = Dataset()
+ #dset.Sequences = Sequences
+ #dset.Acceptor = Acceptors
+ #dset.Donors = Donors
+ #dset.Exons = Exons
+ #dset.Ests = Ests
+ #dset.Qualities = Qualities
+ #cPickle.dump(dset,open(filename,'w+'))
+ else:
+ dset = cPickle.load(open(filename))
+ Sequences = dset.Sequences
+ Acceptors = dset.Acceptors
+ Donors = dset.Donors
+ Exons = dset.Exons
+ Ests = dset.Ests
+ Qualities = dset.Qualities
+
+ #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
#Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
#Qualities = []
#for i in range(len(Ests)):
don_supp = Donors[exampleIdx]
acc_supp = Acceptors[exampleIdx]
- if exons[-1,1] > len(dna):
- continue
+ #if exons[-1,1] > len(dna):
+ # continue
#pdb.set_trace()
# Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)
for pathNr in range(num_path[exampleIdx]):
#print 'decodedWeights'
- weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,
- h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,
- acc_supp,True)
+ weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
+ h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
+ acc_supp)
decodedQualityFeatures = zeros((Configuration.totalQualSuppPoints,1))
for qidx in range(Configuration.totalQualSuppPoints):
int up_idx, down_idx, status;
char* upstream_line = malloc(sizeof(char)*256);
char* downstream_line = malloc(sizeof(char)*256);
+ int p_start, p_stop;
int buffer_size= 64;
new_chastity[0] = '\0';
int splitpos = 0;
- fit = fitting(up_prb+(36-overlap),down_prb);
+ fit = fitting(up_prb+(36-overlap),down_prb+(exon_start-down_pos));
if (fit == -1)
continue;
new_strand = up_strand;
splitpos = overlap;
+ p_start = up_pos;
+ p_stop = exon_start+(read_size-overlap);
+
strncat(new_seq,new_up_seq,overlap);
strncat(new_prb,up_prb,overlap);
strncat(new_cal_prb,up_cal_prb,overlap);
strncat(new_chastity,up_chastity,overlap);
- strncat(new_seq,new_down_seq+fit,read_size-overlap);
- strncat(new_prb,down_prb+fit,read_size-overlap);
- strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
- strncat(new_chastity,down_chastity+fit,read_size-overlap);
-
- fprintf(out_fs,"%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\n",
- new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,exon_idx);
+ //strncat(new_seq,new_down_seq+fit,read_size-overlap);
+ //strncat(new_prb,down_prb+fit,read_size-overlap);
+ //strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
+ //strncat(new_chastity,down_chastity+fit,read_size-overlap);
+
+ int offset = exon_start-down_pos;
+ strncat(new_seq,new_down_seq+offset,read_size-overlap);
+ strncat(new_prb,down_prb+offset,read_size-overlap);
+ strncat(new_cal_prb,down_cal_prb+offset,read_size-overlap);
+ strncat(new_chastity,down_chastity+offset,read_size-overlap);
+
+ // The four last integers specify the positions of
+ // p_start : the position in the dna where the first read starts
+ // exons_stop : the position in the dna where the first exons ends
+ // exons_start : the position in the dna where the second exons starts
+ // p_stop : the position in the dna where the (truncated) second read ends
+ fprintf(out_fs,"%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
+ new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
combined_reads++;
used_flag[down_idx] = 1;
free(upstream_line);
free(downstream_line);
+ free(used_flag);
free(new_up_seq);
free(new_down_seq);
didx = uidx+w_size;
current_mean_up += up_prb[uidx]-50;
current_mean_down += up_prb[didx]-50;
-
}
+
current_mean_up /= w_size;
current_mean_down /= w_size;
didx = uidx+w_size;
current_variance_up += pow(up_prb[uidx]-50 - current_mean_up,2);
current_variance_down += pow(up_prb[didx]-50 - current_mean_up,2);
-
}
+
current_variance_up /= (w_size-1);
current_variance_down /= (w_size-1);