void fill_matrix(int nr_paths, Pre_score* matrices[], int est_len /*counter var: i*/, int dna_len /*counter var: j*/, char* est, char* dna, penalty_struct* functions , double* matchmatrix, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions)
{
- printf("Entering fill_matrix...\n");
+ // printf("Entering fill_matrix...\n");
/*********************************************************************************************/
// variables
double prevValue ;
double tempValue;
int max_len = (int)functions->limits[functions->len-1] ; //last (len) supporting point of h
- printf("max_len: %d\n", max_len) ;
+ //printf("max_len: %d\n", max_len) ;
int num_elem ; // counter variable
delete[] array;
delete[] scores;
- printf("Leaving fill_matrix...\n");
+ // printf("Leaving fill_matrix...\n");
return;
}
id = 0;
name = 0;
use_svm = 0;
-}
-Alignment::~Alignment() {}
+ splice_align = 0;
+ est_align = 0;
+ mmatrix_param = 0;
+ alignmentscores = 0;
+ qualityMatrix = 0;
+}
void Alignment::setQualityMatrix(double* qMat, int length){
- qualityMatrix = new double[length];
+ if(qualityMatrix == 0)
+ qualityMatrix = new double[length];
+
for(int i=0; i<length; i++)
qualityMatrix[i] = qMat[i];
}
double* donor, int d_len, double* acceptor, int a_len,
bool remove_duplicate_scores, bool print_matrix) {
- printf("Entering myalign...\n");
+ // printf("Entering myalign...\n");
mlen = 6; // score matrix: length of 6 for "- A C G T N"
dna_len = dna_len_p + 1 ;
matrices[z] = new Pre_score[dna_len * est_len];
}
- printf("calling fill_matrix...\n");
+ // printf("calling fill_matrix...\n");
fill_matrix(nr_paths, matrices, est_len, dna_len, est, dna, &h, matchmatrix, donor, acceptor, remove_duplicate_scores, nr_donor_sites, donor_sites, max_score_positions);
- printf("after call to fill_matrix...\n");
+ // printf("after call to fill_matrix...\n");
/***************************************************************************/
// return arguments etc.
/***************************************************************************/
int result_length; //Eine Variable fuer alle Matrizen
- printf("before init of splice_align and rest...\n");
+ // printf("before init of splice_align and rest...\n");
splice_align = new int[(dna_len-1)*nr_paths];
est_align = new int[(est_len-1)*nr_paths];
mmatrix_param = new int[(mlen*mlen)*nr_paths];
alignmentscores = new double[nr_paths]; //alignment score for each path/matrix
- printf("before memset...\n");
+ // printf("before memset...\n");
memset((char*)splice_align, -1, (dna_len-1)*nr_paths*sizeof(int)); // fills splice_align with -1 (every byte 0xFF)
memset((char*)est_align, -1, (est_len-1)*nr_paths*sizeof(int)); // fills est_align with -1 (every byte 0xFF)
memset((char*)mmatrix_param, 0, mlen*mlen*nr_paths*sizeof(int)); //fills mmatrix_param with zeros
memset(alignmentscores, -1, nr_paths*sizeof(double)); // sets every byte to 0xFF (a NaN bit pattern), NOT zeros
- printf("after memset...\n");
+ // printf("after memset...\n");
+ // dnaest
+ double *DNA_ARRAY = 0;
+ double *EST_ARRAY = 0;
for (int z=0; z<nr_paths; z++) {
result_length = 0 ;
bool no_more_path = result_align(matrices, z, est_len, dna_len, &result_length, est, dna, s_align, e_align, mparam, alignmentscores, max_score_positions);
- // dnaest
- double *DNA = new double[result_length];
- double *EST = new double[result_length];
-
+ if(DNA_ARRAY != 0) {
+ delete[] DNA_ARRAY;
+ delete[] EST_ARRAY;
+ }
+
+ DNA_ARRAY = new double[result_length];
+ EST_ARRAY = new double[result_length];
+
//backtracking
int i = max_score_positions[2*z] ; //i (est)
int j = max_score_positions[2*z +1] ; //j (dna)
for (int ii=result_length; ii>0; ii--) {
if ((prev_i == (i-1)) && (prev_j == (j-1))) { // match or mismatch
- DNA[ii-1] = check_char(dna[j-1]) ;
- EST[ii-1] = check_char(est[i-1]) ;
+ DNA_ARRAY[ii-1] = check_char(dna[j-1]) ;
+ EST_ARRAY[ii-1] = check_char(est[i-1]) ;
}
- else if ((prev_i == (i)) && (prev_j == (j-1))) {// gap on EST
- DNA[ii-1] = check_char(dna[j-1]) ;
- EST[ii-1] = 0 ; //gap
+ else if ((prev_i == (i)) && (prev_j == (j-1))) {// gap on EST_ARRAY
+ DNA_ARRAY[ii-1] = check_char(dna[j-1]) ;
+ EST_ARRAY[ii-1] = 0 ; //gap
}
- else if ((prev_i == (i-1)) && (prev_j == (j))) { // gap on DNA
- DNA[ii-1] = 0 ; //gap
- EST[ii-1] = check_char(est[i-1]) ;
+ else if ((prev_i == (i-1)) && (prev_j == (j))) { // gap on DNA_ARRAY
+ DNA_ARRAY[ii-1] = 0 ; //gap
+ EST_ARRAY[ii-1] = check_char(est[i-1]) ;
}
else {// splice site
for (j; j > prev_j; j--) {
- DNA[ii-1] = check_char(dna[j-1]) ;
- EST[ii-1] = 6 ; //intron
+ DNA_ARRAY[ii-1] = check_char(dna[j-1]) ;
+ EST_ARRAY[ii-1] = 6 ; //intron
ii-- ;
}
ii++ ; // last ii-- too much (because done in for-loop)
} //end of z
- printf("Leaving myalign...\n");
+ if(DNA_ARRAY != 0) {
+ delete[] DNA_ARRAY;
+ delete[] EST_ARRAY;
+ }
+
+ for (int i=nr_paths-1;i>=0; i--)
+ delete[] matrices[i];
+
+ //printf("Leaving myalign...\n");
}
void Alignment::getAlignmentResults(int* s_align, int* e_align,
int* mmatrix_p, double* alignscores) {
- printf("Entering getAlignmentResults...\n");
+ // printf("Entering getAlignmentResults...\n");
uint splice_align_size = (dna_len-1)*nr_paths;
uint est_align_size = (est_len-1)*nr_paths;
uint mmatrix_param_size = (mlen*mlen)*nr_paths;
for(int idx=0; idx<alignmentscores_size; idx++)
alignscores[idx] = alignmentscores[idx];
- printf("Leaving getAlignmentResults...\n");
+ // printf("Leaving getAlignmentResults...\n");
}
int* est_align;
int* mmatrix_param;
double* alignmentscores;
-
double* qualityMatrix;
int dna_len;
public:
Alignment();
- ~Alignment();
+ ~Alignment() {
+ if(splice_align != 0)
+ delete[] splice_align;
+
+ if(est_align != 0)
+ delete[] est_align;
+
+ if(mmatrix_param != 0)
+ delete[] mmatrix_param;
+
+ if(alignmentscores != 0)
+ delete[] alignmentscores;
+
+ if(qualityMatrix != 0)
+ delete[] qualityMatrix;
+ }
void myalign(int nr_paths_p, char* dna, int dna_len_p, char* est,
int est_len_p, struct penalty_struct h, double* matchmatrix, int mm_len,
def __init__(self):
""" default parameters """
- #self.basedir = '/fml/ag-raetsch/share/projects/qpalma/elegans_palma'
- self.basedir = '/fml/ag-raetsch/share/projects/qpalma/zebrafish'
+ self.basedir = '/fml/ag-raetsch/share/projects/qpalma/elegans_palma2'
+ #self.basedir = '/fml/ag-raetsch/share/projects/qpalma/zebrafish'
self.MAX_MEM = 31000;
self.LOCAL_ALIGN = 1;
self.init_param = 1;
self.C = 0.001;
self.microexon = 0;
self.prob = 0;
- self.organism = 'zebrafish'
+ self.organism = 'elegans'
self.expt = 'training'
self.insertion_prob = self.prob/3 ;
from SIQP_CPX import SIQPSolver
from paths_load_data import *
+from paths_load_data_pickle import *
+
from computeSpliceWeights import *
from set_param_palma import *
from computeSpliceAlign import *
def run(self):
# Load the whole dataset
- Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS)
-
- #Sequences, Acceptors, Donors, Exons, Ests, QualityScores = paths_load_data('training',self.genome_info,self.ARGS)
-
+ #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data('training',self.genome_info,self.ARGS)
+ Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
# number of training instances
N = len(Sequences)
and N == len(Ests), 'The Seq,Accept,Donor,.. arrays are of different lengths'
self.plog('Number of training examples: %d\n'% N)
- iteration_steps = 10 ; #upper bound on iteration steps
+ iteration_steps = 200 ; #upper bound on iteration steps
remove_duplicate_scores = False
print_matrix = False
# Set the parameters such as limits penalties for the Plifs
[h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
# delete splicesite-score-information
if not self.ARGS.train_with_splicesitescoreinformation:
for i in range(len(Acceptors)):
break
for exampleIdx in range(self.numExamples):
- if exampleIdx% 1000 == 0:
+ if (exampleIdx%10) == 0:
print 'Current example nr %d' % exampleIdx
dna = Sequences[exampleIdx]
#print 'PYTHON: Calling myalign...'
# calculates SpliceAlign, EstAlign, weightMatch, Gesamtscores, dnaest
currentAlignment.myalign( num_path[exampleIdx], dna, dna_len,\
- est, est_len, ps, matchmatrix, mm_len, donor, d_len,\
- acceptor, a_len, remove_duplicate_scores, print_matrix)
+ est, est_len, ps, matchmatrix, mm_len, donor, d_len,\
+ acceptor, a_len, remove_duplicate_scores, print_matrix)
#print 'PYTHON: After myalign call...'
newSpliceAlign = QPalmaDP.createIntArrayFromList([0]*(dna_len*num_path[exampleIdx]))
newAlignmentScores = QPalmaDP.createDoubleArrayFromList([.0]*num_path[exampleIdx])
currentAlignment.getAlignmentResults(newSpliceAlign, newEstAlign, newWeightMatch, newAlignmentScores)
+ del currentAlignment
spliceAlign = zeros((num_path[exampleIdx]*dna_len,1))
weightMatch = zeros((num_path[exampleIdx]*mm_len,1))
for elem in self.slacks:
sum_xis += elem
- for i in range(len(param)):
- param[i] = w[i]
+ for i in range(len(param)):
+ param[i] = w[i]
+
+ [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
- [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
-
- if exampleIdx==10:
+ #
+ # end of one example processing
+ #
+ if exampleIdx == 10:
break
+
+ break
+ #
+ # end of one iteration through all examples
+ #
iteration_nr += 1
- export_param('test_params',h,d,a,mmatrix)
+ #
+ # end of optimization
+ #
+ export_param('elegans.param',h,d,a,mmatrix)
self.logfh.close()
print 'Training completed'
def convert2SWIG(self):
ps = QPalmaDP.penalty_struct()
+
+ ps.len = len(self.limits)
ps.limits = QPalmaDP.createDoubleArrayFromList(self.limits)
ps.penalties = QPalmaDP.createDoubleArrayFromList(self.penalties)
--- /dev/null
+""" --- palma ---
+ Some useful functions and classes used by palma.py
+"""
+
+__version__ = '0.3.3'
+
+# Copyright notice string
+__copyright__ = """\
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+Written (W) 2006 Uta Schulze
+Copyright (C) 2006 Max-Planck-Society
+"""
+
+
+import sys
+import numpy
+import palma_utils as pu
+import alignment_tool as at
+
+
+class alignment_info:
+ nr_paths_found = -1
+ score = []
+ match = []
+ mism = []
+ qGapb = []
+ tGapb = []
+ qStart = []
+ qEnd = []
+ tStart = []
+ tEnd = []
+ num_exons = []
+ qExonSizes = []
+ qStarts = []
+ qEnds = []
+ tExonSizes = []
+ tStarts = []
+ tEnds = []
+ exon_length = []
+ identity = []
+ comparison = []
+ qIDX = []
+ tIDX = []
+ dna_letters = []
+ est_letters = []
+
+def do_alignment(dna, dna_len, est, est_len, reduce_region, model, t_strand,\
+ translation, comparison_char, don_support_values, acc_support_values):
+ import time
+
+ nr_paths = 1
+ mlen = 6 #length of match matrix, const
+
+ align_info = alignment_info()
+ align_info.score = []
+ align_info.match = []
+ align_info.mism = []
+ align_info.qGapb = []
+ align_info.tGapb = []
+ align_info.qStart = []
+ align_info.qEnd = []
+ align_info.tStart = []
+ align_info.tEnd = []
+ align_info.num_exons = []
+ align_info.qExonSizes = []
+ align_info.qStarts = []
+ align_info.qEnds = []
+ align_info.tExonSizes = []
+ align_info.tStarts = []
+ align_info.tEnds = []
+ align_info.exon_length = []
+ align_info.identity = []
+ align_info.comparison = []
+ align_info.qIDX = []
+ align_info.tIDX = []
+ align_info.dna_letters = []
+ align_info.est_letters = []
+ align_info.splice_align = []
+
+ t0 = time.time()
+ #<------------------------------------------------------------------------------>
+ # assigns parameter [h(30),d(30),a(30),mmatrix(36)]
+ # computes scores for donor/acceptor sites using p.l. function
+ #<------------------------------------------------------------------------------>
+ h,d,a,mmatrix = pu.set_param(model)
+ donor = don_support_values
+ acceptor = acc_support_values
+ for cnt in range(dna_len):
+ if numpy.isfinite(don_support_values[cnt]):
+ donor[cnt] = pu.penalty_lookup(d,don_support_values[cnt])
+ if numpy.isfinite(acc_support_values[cnt]):
+ acceptor[cnt] = pu.penalty_lookup(a,acc_support_values[cnt])
+ del don_support_values
+ del acc_support_values
+ #<------------------------------------------------------------------------------>
+
+ #print time.time() - t0
+
+ #<------------------------------------------------------------------------------>
+ # convert list to double-Array (py -> c++),
+ #<------------------------------------------------------------------------------>
+ mmatrix_array = at.createDoubleArrayFromList(mmatrix)
+ donor_array = at.createDoubleArrayFromList(donor)
+ acceptor_array = at.createDoubleArrayFromList(acceptor)
+ h_struct = at.penalty()
+ h_struct.len = h.len
+ h_struct.limits = at.createDoubleArrayFromList(h.limits)
+ h_struct.penalties = at.createDoubleArrayFromList(h.penalties)
+ h_struct.name = h.name
+ h_struct.max_len = h.max_len
+ h_struct.min_len = h.min_len
+ h_struct.transform = h.transform
+ #<------------------------------------------------------------------------------>
+
+ #print time.time() - t0
+
+ #<------------------------------------------------------------------------------>
+ # call myalign
+ #<------------------------------------------------------------------------------>
+ BLA = at.align_info() ; #not necessary
+ try:
+ BLA = at.myalign(reduce_region, mlen, dna_len, dna, est_len, est, h_struct,\
+ mmatrix_array, donor_array, acceptor_array)
+ except MemoryError:
+ raise(MemoryError)
+
+ #print time.time() - t0
+
+#<------------------------------------------------------------------------------>
+
+
+#<------------------------------------------------------------------------------>
+# myalign Output - convert arrays to list (c++ -> py)
+#<------------------------------------------------------------------------------>
+ splice_align_array = at.createListFromIntArray(BLA.splice_align, dna_len*nr_paths)
+ est_align_array = at.createListFromIntArray(BLA.est_align, est_len*nr_paths)
+ mmatrix_param_array = at.createListFromIntArray(BLA.mmatrix_param, mlen*mlen*nr_paths)
+ alignment_scores = at.createListFromDoubleArray(BLA.alignment_scores,nr_paths)
+ alignment_length = at.createListFromIntArray(BLA.alignment_length, nr_paths)
+ #print "before crash on intel mac"
+ aligned_dna_array = at.createListFromIntArray(BLA.aligned_dna, alignment_length[0])
+ aligned_est_array = at.createListFromIntArray(BLA.aligned_est, alignment_length[0])
+ #print "after crash on intel mac"
+#<------------------------------------------------------------------------------>
+
+ #print alignment_scores
+
+#<------------------------------------------------------------------------------>
+# free memory
+#<------------------------------------------------------------------------------>
+ at.delete_intArray(BLA.splice_align)
+ at.delete_intArray(BLA.est_align)
+ at.delete_intArray(BLA.mmatrix_param)
+ at.delete_intArray(BLA.aligned_dna)
+ at.delete_intArray(BLA.aligned_est)
+ at.delete_doubleArray(BLA.alignment_scores)
+ at.delete_intArray(BLA.alignment_length)
+
+ del BLA
+ del h_struct
+ at.delete_doubleArray(mmatrix_array)
+ at.delete_doubleArray(donor_array)
+ at.delete_doubleArray(acceptor_array)
+#<------------------------------------------------------------------------------>
+
+
+#<------------------------------------------------------------------------------>
+# myalign output: creating lists of information
+# i.th entry in list is alignment for i.th path (nr_path many)
+#<------------------------------------------------------------------------------>
+ mmatrix_param = []
+ splice_align = []
+ est_align = []
+ dna_numbers = []
+ est_numbers = []
+ dna_letters = []
+ est_letters = []
+
+ for i in range(nr_paths):
+ result_length = alignment_length[i] #length of i.th alignment
+ if result_length == 0:
+ #no alignment found
+ nr_paths = i-1+1 #+1 because index starts with zero
+ alignment_scores = alignment_scores[0:nr_paths]
+ alignment_length = alignment_length[0:nr_paths]
+ assert(len(mmatrix_param)==nr_paths)
+ assert(len(splice_align)==nr_paths)
+ assert(len(est_align)==nr_paths)
+ assert(len(dna_numbers)==nr_paths)
+ assert(len(est_numbers)==nr_paths)
+ assert(len(dna_letters)==nr_paths)
+ assert(len(est_letters)==nr_paths)
+ break
+
+ #match matrix for the i.th path (numbers of matches, mismatches and gaps):
+ mmatrix_param.append(mmatrix_param_array[i*mlen*mlen:(i+1)*mlen*mlen])
+
+ #splice_align and est_align for the i.th path (show dangling ends and exons/introns):
+ splice_align.append(splice_align_array[i*dna_len:(i+1)*dna_len])
+ est_align.append(est_align_array[i*est_len:(i+1)*est_len])
+
+ dna_numbers.append(aligned_dna_array[0:result_length]) #alignment in numbers (see translation)
+ est_numbers.append(aligned_est_array[0:result_length]) #alignment in numbers (see translation)
+ aligned_dna_array = aligned_dna_array[result_length:] #shorten aligned_dna_array
+ aligned_est_array = aligned_est_array[result_length:] #shorten aligned_est_array
+
+ dna_letters.append(reduce(lambda x,y: x+y, map(lambda x: translation[int(x)],dna_numbers[i])))
+ est_letters.append(reduce(lambda x,y: x+y, map(lambda x: translation[int(x)],est_numbers[i])))
+
+ if nr_paths==0:
+ #absolutely no alignment found
+ align_info.nr_paths_found = nr_paths
+ return align_info
+
+ assert(len(aligned_dna_array)==0)
+ assert(len(aligned_est_array)==0)
+#<------------------------------------------------------------------------------>
+
+
+#<------------------------------------------------------------------------------>
+# analysing alignments (n-best)
+#<------------------------------------------------------------------------------>
+ if 0:
+ print alignment_scores
+ print alignment_length
+ print mmatrix_param
+ print splice_align
+ print est_align
+ print dna_numbers
+ print est_numbers
+ print dna_letters
+ print est_letters
+
+ align_info.nr_paths_found = nr_paths
+
+ ai = [] ;
+ for i in range(nr_paths):
+ (match, mism, qGapb, tGapb) = pu.evaluate_mmatrix(mmatrix_param[i])
+ if (match +mism +qGapb +tGapb) == 0:
+      # what to do then...? I don't think this happens
+ align_info.nr_paths_found -= 1
+ continue
+
+ (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds) =\
+ pu.get_splice_info(splice_align[i], est_align[i])
+ #print tStarts
+
+ score = alignment_scores[i]
+
+ (exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX) =\
+ pu.comp_identity(dna_letters[i], est_letters[i], t_strand, qStart, tStart, translation, comparison_char)
+
+ first_identity = None
+ last_identity = None
+ for i in range(len(comparison)):
+ if comparison[i] == '|' and first_identity is None: first_identity = i
+ #print i,comparison[i],first_identity,qIDX[i],tIDX[i]
+ if comparison[-i] == '|' and last_identity is None: last_identity = len(comparison) - i - 1
+ #print first_identity, last_identity, len(tIDX), len(qIDX)
+ #print tIDX[first_identity], tIDX[last_identity], qIDX[first_identity], qIDX[last_identity]
+ #print tStart, tEnd, qStart, qEnd
+ #exon_length[0] = exon_length[0] - tIDX[first_identity] - tStart + 2
+ #exon_length[-1] = exon_length[-1] - tEnd + tIDX[last_identity] + 1
+ tStart = tIDX[first_identity]
+ tStarts[0] = tIDX[first_identity]
+ tEnd = tIDX[last_identity]
+ tEnds[-1] = tIDX[last_identity]
+ qStart = qIDX[first_identity]
+ qStarts[0] = qIDX[first_identity]
+ qEnd = qIDX[last_identity]
+ qEnds[-1] = qIDX[last_identity]
+
+ align_info.score = score
+ align_info.match = match
+ align_info.mism = mism
+ align_info.qGapb = qGapb
+ align_info.tGapb = tGapb
+ align_info.qStart = qStart
+ align_info.qEnd = qEnd
+ align_info.tStart = tStart
+ align_info.tEnd = tEnd
+ align_info.num_exons = num_exons
+ align_info.qExonSizes = qExonSizes
+ align_info.qStarts = qStarts
+ align_info.qEnds = qEnds
+ align_info.tExonSizes =tExonSizes
+ align_info.tStarts = tStarts
+ align_info.tEnds = tEnds
+ align_info.exon_length = exon_length
+ align_info.identity = identity
+ align_info.ssQuality = ssQuality
+ align_info.exonQuality = exonQuality
+ align_info.comparison = comparison
+ align_info.qIDX = qIDX
+ align_info.tIDX = tIDX
+ align_info.dna_letters = dna_letters
+ align_info.est_letters = est_letters
+
+ ai.append(align_info)
+
+#<------------------------------------------------------------------------------>
+
+ return ai
--- /dev/null
+# This file was automatically generated by SWIG (http://www.swig.org).
+# Version 1.3.31
+#
+# Don't modify this file, modify the SWIG interface instead.
+# This file is compatible with both classic and new-style classes.
+
+import _alignment_tool
+import new
+new_instancemethod = new.instancemethod
+try:
+ _swig_property = property
+except NameError:
+ pass # Python < 2.2 doesn't have 'property'.
+def _swig_setattr_nondynamic(self,class_type,name,value,static=1):
+ if (name == "thisown"): return self.this.own(value)
+ if (name == "this"):
+ if type(value).__name__ == 'PySwigObject':
+ self.__dict__[name] = value
+ return
+ method = class_type.__swig_setmethods__.get(name,None)
+ if method: return method(self,value)
+ if (not static) or hasattr(self,name):
+ self.__dict__[name] = value
+ else:
+ raise AttributeError("You cannot add attributes to %s" % self)
+
+def _swig_setattr(self,class_type,name,value):
+ return _swig_setattr_nondynamic(self,class_type,name,value,0)
+
+def _swig_getattr(self,class_type,name):
+ if (name == "thisown"): return self.this.own()
+ method = class_type.__swig_getmethods__.get(name,None)
+ if method: return method(self)
+ raise AttributeError,name
+
+def _swig_repr(self):
+ try: strthis = "proxy of " + self.this.__repr__()
+ except: strthis = ""
+ return "<%s.%s; %s >" % (self.__class__.__module__, self.__class__.__name__, strthis,)
+
+import types
+try:
+ _object = types.ObjectType
+ _newclass = 1
+except AttributeError:
+ class _object : pass
+ _newclass = 0
+del types
+
+
+new_intArray = _alignment_tool.new_intArray
+delete_intArray = _alignment_tool.delete_intArray
+intArray_getitem = _alignment_tool.intArray_getitem
+intArray_setitem = _alignment_tool.intArray_setitem
+new_doubleArray = _alignment_tool.new_doubleArray
+delete_doubleArray = _alignment_tool.delete_doubleArray
+doubleArray_getitem = _alignment_tool.doubleArray_getitem
+doubleArray_setitem = _alignment_tool.doubleArray_setitem
+class MyAlignException(_object):
+ __swig_setmethods__ = {}
+ __setattr__ = lambda self, name, value: _swig_setattr(self, MyAlignException, name, value)
+ __swig_getmethods__ = {}
+ __getattr__ = lambda self, name: _swig_getattr(self, MyAlignException, name)
+ __repr__ = _swig_repr
+ def __init__(self, *args):
+ this = _alignment_tool.new_MyAlignException(*args)
+ try: self.this.append(this)
+ except: self.this = this
+ __swig_destroy__ = _alignment_tool.delete_MyAlignException
+ __del__ = lambda self : None;
+MyAlignException_swigregister = _alignment_tool.MyAlignException_swigregister
+MyAlignException_swigregister(MyAlignException)
+
+class penalty(_object):
+ __swig_setmethods__ = {}
+ __setattr__ = lambda self, name, value: _swig_setattr(self, penalty, name, value)
+ __swig_getmethods__ = {}
+ __getattr__ = lambda self, name: _swig_getattr(self, penalty, name)
+ __repr__ = _swig_repr
+ __swig_setmethods__["len"] = _alignment_tool.penalty_len_set
+ __swig_getmethods__["len"] = _alignment_tool.penalty_len_get
+ if _newclass:len = _swig_property(_alignment_tool.penalty_len_get, _alignment_tool.penalty_len_set)
+ __swig_setmethods__["limits"] = _alignment_tool.penalty_limits_set
+ __swig_getmethods__["limits"] = _alignment_tool.penalty_limits_get
+ if _newclass:limits = _swig_property(_alignment_tool.penalty_limits_get, _alignment_tool.penalty_limits_set)
+ __swig_setmethods__["penalties"] = _alignment_tool.penalty_penalties_set
+ __swig_getmethods__["penalties"] = _alignment_tool.penalty_penalties_get
+ if _newclass:penalties = _swig_property(_alignment_tool.penalty_penalties_get, _alignment_tool.penalty_penalties_set)
+ __swig_setmethods__["name"] = _alignment_tool.penalty_name_set
+ __swig_getmethods__["name"] = _alignment_tool.penalty_name_get
+ if _newclass:name = _swig_property(_alignment_tool.penalty_name_get, _alignment_tool.penalty_name_set)
+ __swig_setmethods__["max_len"] = _alignment_tool.penalty_max_len_set
+ __swig_getmethods__["max_len"] = _alignment_tool.penalty_max_len_get
+ if _newclass:max_len = _swig_property(_alignment_tool.penalty_max_len_get, _alignment_tool.penalty_max_len_set)
+ __swig_setmethods__["min_len"] = _alignment_tool.penalty_min_len_set
+ __swig_getmethods__["min_len"] = _alignment_tool.penalty_min_len_get
+ if _newclass:min_len = _swig_property(_alignment_tool.penalty_min_len_get, _alignment_tool.penalty_min_len_set)
+ __swig_setmethods__["transform"] = _alignment_tool.penalty_transform_set
+ __swig_getmethods__["transform"] = _alignment_tool.penalty_transform_get
+ if _newclass:transform = _swig_property(_alignment_tool.penalty_transform_get, _alignment_tool.penalty_transform_set)
+ def __init__(self, *args):
+ this = _alignment_tool.new_penalty(*args)
+ try: self.this.append(this)
+ except: self.this = this
+ __swig_destroy__ = _alignment_tool.delete_penalty
+ __del__ = lambda self : None;
+penalty_swigregister = _alignment_tool.penalty_swigregister
+penalty_swigregister(penalty)
+
+class align_info(_object):
+ __swig_setmethods__ = {}
+ __setattr__ = lambda self, name, value: _swig_setattr(self, align_info, name, value)
+ __swig_getmethods__ = {}
+ __getattr__ = lambda self, name: _swig_getattr(self, align_info, name)
+ __repr__ = _swig_repr
+ __swig_setmethods__["splice_align"] = _alignment_tool.align_info_splice_align_set
+ __swig_getmethods__["splice_align"] = _alignment_tool.align_info_splice_align_get
+ if _newclass:splice_align = _swig_property(_alignment_tool.align_info_splice_align_get, _alignment_tool.align_info_splice_align_set)
+ __swig_setmethods__["est_align"] = _alignment_tool.align_info_est_align_set
+ __swig_getmethods__["est_align"] = _alignment_tool.align_info_est_align_get
+ if _newclass:est_align = _swig_property(_alignment_tool.align_info_est_align_get, _alignment_tool.align_info_est_align_set)
+ __swig_setmethods__["mmatrix_param"] = _alignment_tool.align_info_mmatrix_param_set
+ __swig_getmethods__["mmatrix_param"] = _alignment_tool.align_info_mmatrix_param_get
+ if _newclass:mmatrix_param = _swig_property(_alignment_tool.align_info_mmatrix_param_get, _alignment_tool.align_info_mmatrix_param_set)
+ __swig_setmethods__["alignment_scores"] = _alignment_tool.align_info_alignment_scores_set
+ __swig_getmethods__["alignment_scores"] = _alignment_tool.align_info_alignment_scores_get
+ if _newclass:alignment_scores = _swig_property(_alignment_tool.align_info_alignment_scores_get, _alignment_tool.align_info_alignment_scores_set)
+ __swig_setmethods__["alignment_length"] = _alignment_tool.align_info_alignment_length_set
+ __swig_getmethods__["alignment_length"] = _alignment_tool.align_info_alignment_length_get
+ if _newclass:alignment_length = _swig_property(_alignment_tool.align_info_alignment_length_get, _alignment_tool.align_info_alignment_length_set)
+ __swig_setmethods__["aligned_dna"] = _alignment_tool.align_info_aligned_dna_set
+ __swig_getmethods__["aligned_dna"] = _alignment_tool.align_info_aligned_dna_get
+ if _newclass:aligned_dna = _swig_property(_alignment_tool.align_info_aligned_dna_get, _alignment_tool.align_info_aligned_dna_set)
+ __swig_setmethods__["aligned_est"] = _alignment_tool.align_info_aligned_est_set
+ __swig_getmethods__["aligned_est"] = _alignment_tool.align_info_aligned_est_get
+ if _newclass:aligned_est = _swig_property(_alignment_tool.align_info_aligned_est_get, _alignment_tool.align_info_aligned_est_set)
+ def __init__(self, *args):
+ this = _alignment_tool.new_align_info(*args)
+ try: self.this.append(this)
+ except: self.this = this
+ __swig_destroy__ = _alignment_tool.delete_align_info
+ __del__ = lambda self : None;
+align_info_swigregister = _alignment_tool.align_info_swigregister
+align_info_swigregister(align_info)
+
+myalign = _alignment_tool.myalign
+def createDoubleArrayFromList(list):
+ array = new_doubleArray(len(list))
+ for i in range(len(list)):
+ doubleArray_setitem(array, i, list[i])
+ return array
+
+def createListFromIntArray(array, array_len):
+ list = [0]*array_len
+ for i in range(array_len):
+ list[i] = intArray_getitem(array,i)
+ return list
+
+def createListFromDoubleArray(array, array_len):
+ list = [0]*array_len
+ for i in range(array_len):
+ list[i] = doubleArray_getitem(array,i)
+ return list
+
+
+
+
--- /dev/null
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2006-2007 Soeren Sonnenburg
+# Written (W) 2006-2007 Mikio Braun
+# Copyright (C) 2007 Fraunhofer Institute FIRST and Max-Planck-Society
+#
+
+import time
+from string import maketrans
+
+""" this function is 100% compatible with the matlab function, thus it is one-based (!)
+    use one_based=False if needed; the interval is then [start,stop) (excluding stop)
+"""
+def load_genomic(chromosome, strand, start, stop, genome, one_based=True):
+ fname = '/fml/ag-raetsch/share/databases/genomes/' + genome + '/' + chromosome[3:] + '.flat'
+ f=file(fname)
+ if one_based:
+ f.seek(start-1)
+ str=f.read(stop-start+1)
+ else:
+ f.seek(start)
+ str=f.read(stop-start)
+
+ if strand=='-':
+ return reverse_complement(str)
+ elif strand=='+':
+ return str
+ else:
+ print 'strand must be + or -'
+ raise KeyError
+
+""" read a table browser ascii output file (http://genome.ucsc.edu/cgi-bin/hgTables) """
+def read_table_browser(f):
+ table=dict();
+ for l in f.readlines():
+ if not l.startswith('#'):
+ (name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,proteinID,alignID)=l.split('\t')
+ exonStarts=[ int(i) for i in exonStarts.split(',')[:-1] ]
+ exonEnds=[ int(i) for i in exonEnds.split(',')[:-1] ]
+
+ table[name]={ 'chrom': chrom, 'strand': strand, 'txStart': int(txStart), 'txEnd': int(txEnd),
+ 'cdsStart': int(cdsStart), 'cdsEnd': int(cdsEnd), 'exonCount': int(exonCount), 'exonStarts': exonStarts,
+ 'exonEnds': exonEnds, 'proteinID': proteinID, 'alignID': alignID[:-1] }
+
+ return table
+
+""" get promoter region """
+def get_promoter_region(chromosome, strand, gene_start, gene_end, genome, length):
+
+ if strand == '+':
+ return load_genomic(chromosome, strand, gene_start, gene_start+length, genome, one_based=False)
+ elif strand == '-':
+ return load_genomic(chromosome, strand, gene_end, gene_end+length, genome, one_based=False)
+ else:
+ print 'unknown strand'
+ return None
+
+""" reverse-complement a DNA sequence (only the letters acgt/ACGT are translated!)
+    FIXME: other IUPAC ambiguity codes (e.g. y, r, n) are left untranslated """
+def reverse_complement(str):
+ t=maketrans('acgtACGT','tgcaTGCA')
+ return str[len(str)::-1].translate(t)
+
+""" works only with .fa files that contain a single entry """
+def read_single_fasta(fname):
+ str=file(fname).read()
+ str=str[str.index('\n')+1:].replace('\n','')
+ return str
+
+""" writes only single entry .fa files """
+def write_single_fasta(fname, name, str, linelen=60):
+ header= '>' + name + '\n'
+ f=file(fname,'a')
+ f.write(header)
+ for i in xrange(0,len(str),linelen):
+ f.write(str[i:i+linelen]+'\n')
+ f.close()
+
+""" read fasta as dictionary """
+def read_fasta(f, fasta=dict(), id_only=True):
+ import numpy
+ str=f.read()
+ idx = str.find('>')
+ #if idx==-1:
+ if 0: # no support for contig list files -> would need extra blastn workaround
+ # file name list?
+ files = str.split('\n') ;
+ for s in files:
+ if len(s)==0: continue
+ print s.strip() + '\n'
+ fasta = read_fasta(file(s.strip()), fasta)
+ else:
+ # real fasta file?
+ sequences = str.split('>')
+ for s in sequences:
+ if len(s)==0: continue
+ header = s[0:s.index('\n')]
+ if id_only:
+ header = header.split(' ')[0] ;
+ header = header.split('\t')[0] ;
+ seq = s[s.index('\n')+1:].replace('\n','').upper()
+ #print 'has_key', fasta.has_key(header),header
+ assert(not fasta.has_key(header))
+ fasta[header]=seq ;
+
+ return fasta
+
+""" write dictionary fasta """
+def write_fasta(f, d, linelen=60):
+ for k in sorted(d):
+ f.write('>%s\n' % k);
+ s = d[k]
+ for i in xrange(0, len(s), linelen):
+ f.write(s[i:i+linelen] + '\n')
+
+def write_gff(f, (source, version), (seqtype, seqname), descrlist, skipheader=False):
+ """ writes a gff version 2 file
+ descrlist is a list of dictionaries, each of which contain these fields:
+ <seqname> <source> <feature> <start> <end> <score> <strand> <frame> [attributes] [comments]
+ """
+
+ if not skipheader:
+ f.write('##gff-version 2\n')
+ f.write('##source-version %s %s\n' % (source, version) )
+
+ t=time.localtime()
+ f.write("##date %d-%d-%d %d:%d:%d\n" % t[0:6])
+
+ f.write('##Type %s %s\n' % (seqtype, seqname) )
+
+ for d in descrlist:
+ f.write('%s\t%s\t%s\t%d\t%d\t%f\t%s\t%d' % (d['seqname'], d['source'],
+ d['feature'], d['start'], d['end'],
+ d['score'], d['strand'], d['frame']))
+ if d.has_key('attributes'):
+ f.write('\t' + d['attributes'])
+ if d.has_key('comments'):
+ f.write('\t' + d['comments'])
+ f.write('\n')
+
+
+if __name__ == '__main__':
+ import sys,os
+
+ table=read_table_browser(file('/fml/ag-raetsch/home/sonne/addnet/tfbs/share/data/wt1_bibliosphere_table_browser_hg17.txt'))
+ print table.keys()
+ print table[table.keys()[0]]
+ d = { 'ahoernchen' : 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT',
+ 'bhoernchen' : 'GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACA' }
+
+ write_fasta(sys.stdout, d)
+ write_fasta(file('/tmp/test.fa','w'), d)
+
+ d2 = read_fasta(file('/tmp/test.fa'))
+ os.unlink('/tmp/test.fa')
+
+ print d
+ print d2
+ print d == d2
+
+ p=load_genomic('chr5', '+', 100000, 100100,'hg17')
+ n=load_genomic('chr1', '-', 3000000, 3001000,'mm7')
+ write_single_fasta('bla.fa','bla', 'ACGT')
+ n2=read_single_fasta('bla.fa')
--- /dev/null
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2006-2007 Soeren Sonnenburg
+# Written (W) 2007 Gunnar Raetsch
+# Copyright (C) 2007 Fraunhofer Institute FIRST and Max-Planck-Society
+#
+
+import sys
+import numpy
+from numpy import mat,array,chararray,inf
+
+class model(object):
+ #acceptor
+ acceptor_bins=None
+ acceptor_limits=None
+ acceptor_penalties=None
+
+ acceptor_splice_b=None
+ acceptor_splice_order=None
+ acceptor_splice_window_left=None
+ acceptor_splice_window_right=None
+ acceptor_splice_alphas=None
+ acceptor_splice_svs=None
+
+ #donor
+ donor_bins=None
+ donor_limits=None
+ donor_penalties=None
+
+ donor_splice_b=None
+ donor_splice_order=None
+ #donor_splice_use_gc=None
+ donor_splice_window_left=None
+ donor_splice_window_right=None
+ donor_splice_alphas=None
+ donor_splice_svs=None
+
+ # intron length
+ intron_len_bins=None
+ intron_len_limits=None
+ intron_len_penalties=None
+ intron_len_min=None
+ intron_len_max=None
+ intron_len_transform=None
+
+ gene_len_max = None
+
+ # substitution matrix
+ substitution_matrix=None
+
+def parse_file(file):
+ m=model()
+
+ sys.stdout.write("loading model file"); sys.stdout.flush()
+
+ l=file.readline();
+
+ if l != '%palma definition file version: 1.0\n':
+ sys.stderr.write("\nfile not a palma definition file\n")
+ print l
+ return None
+
+ while l:
+ if not ( l.startswith('%') or l.startswith('\n') ): # comment
+ #print l[0:100]
+
+ #acceptor
+ if m.acceptor_bins is None: m.acceptor_bins=parse_value(l, 'acceptor_bins')
+ if m.acceptor_limits is None: m.acceptor_limits=parse_vector(l, file, 'acceptor_limits')
+ if m.acceptor_penalties is None: m.acceptor_penalties=parse_vector(l, file, 'acceptor_penalties') #DEAD
+
+ if m.acceptor_splice_b is None: m.acceptor_splice_b=parse_value(l, 'acceptor_splice_b')
+ if m.acceptor_splice_order is None: m.acceptor_splice_order=parse_value(l, 'acceptor_splice_order')
+ if m.acceptor_splice_window_left is None: m.acceptor_splice_window_left=parse_value(l, 'acceptor_splice_window_left')
+ if m.acceptor_splice_window_right is None: m.acceptor_splice_window_right=parse_value(l, 'acceptor_splice_window_right')
+ if m.acceptor_splice_alphas is None: m.acceptor_splice_alphas=parse_vector(l, file, 'acceptor_splice_alphas')
+ if m.acceptor_splice_svs is None: m.acceptor_splice_svs=parse_string(l, file, 'acceptor_splice_svs')
+
+ #donor
+ if m.donor_bins is None: m.donor_bins=parse_value(l, 'donor_bins')
+ if m.donor_limits is None: m.donor_limits=parse_vector(l, file, 'donor_limits')
+ if m.donor_penalties is None: m.donor_penalties=parse_vector(l, file, 'donor_penalties') #DEAD
+
+ if m.donor_splice_b is None: m.donor_splice_b=parse_value(l, 'donor_splice_b')
+ if m.donor_splice_order is None: m.donor_splice_order=parse_value(l, 'donor_splice_order')
+ #if m.donor_splice_use_gc is None: m.donor_splice_use_gc=parse_value(l, 'donor_splice_use_gc')
+ if m.donor_splice_window_left is None: m.donor_splice_window_left=parse_value(l, 'donor_splice_window_left')
+ if m.donor_splice_window_right is None: m.donor_splice_window_right=parse_value(l, 'donor_splice_window_right')
+ if m.donor_splice_alphas is None: m.donor_splice_alphas=parse_vector(l, file, 'donor_splice_alphas')
+ if m.donor_splice_svs is None: m.donor_splice_svs=parse_string(l, file, 'donor_splice_svs')
+
+
+ # intron length
+ if m.intron_len_bins is None: m.intron_len_bins=parse_value(l, 'intron_len_bins')
+ if m.intron_len_limits is None: m.intron_len_limits=parse_vector(l, file, 'intron_len_limits')
+ if m.intron_len_penalties is None: m.intron_len_penalties=parse_vector(l, file, 'intron_len_penalties')
+ if m.intron_len_min is None: m.intron_len_min=parse_value(l, 'intron_len_min')
+ if m.intron_len_max is None: m.intron_len_max=parse_value(l, 'intron_len_max')
+ if m.intron_len_transform is None: m.intron_len_transform=parse_value(l, 'intron_len_transform')
+
+ # gene length
+ if m.gene_len_max is None: m.gene_len_max=parse_value(l, 'gene_len_max')
+
+ if m.substitution_matrix is None: m.substitution_matrix=parse_vector(l, file, 'substitution_matrix')
+
+ l=file.readline()
+
+ sys.stderr.write('done\n')
+ return m
+
+def parse_value(line, name):
+ if (line.startswith(name)):
+ #print 'found ' + name
+ sys.stdout.write('.'); sys.stdout.flush()
+ str = line[line.find('=')+1:-1] ;
+ if str[0] == "'":
+ return str[1:-1]
+ else:
+ #print float(str)
+ return float(str)
+ else:
+ return None
+
+def parse_vector(line, file, name):
+ mat = parse_matrix(line, file, name)
+ if mat is None:
+ return mat
+ else:
+ mat = numpy.array(mat).flatten()
+ return mat
+
+def parse_matrix(line, file, name):
+ if (line.startswith(name)):
+ sys.stdout.write('.'); sys.stdout.flush()
+ if line.find(']') < 0:
+ l=''
+ while l is not None and l.find(']') < 0:
+ line+=l
+ l=file.readline()
+ if l is not None and l.find(']') >= 0:
+ line+=l
+
+ if line.find(']') < 0:
+ sys.stderr.write("matrix `" + name + "' ended without ']'\n")
+ return None
+ else:
+ return mat(line[line.find('['):line.find(']')+1])
+ else:
+ return None
+
+def parse_string(line, file, name):
+ if (line.startswith(name)):
+ sys.stdout.write('.'); sys.stdout.flush()
+ l=''
+ lines=[]
+ while l is not None and l.find(']') < 0:
+ if l:
+ lines+=[list(l[:-1])]
+ l=file.readline()
+
+ if l.find(']') < 0:
+ sys.stderr.write("string ended without ']'\n")
+ return None
+ else:
+ #seqlen=len(lines[0])
+ num=len(lines)
+ #trdat = chararray((seqlen,num),1,order='FORTRAN')
+ #for i in xrange(num):
+ # trdat[:,i]=lines[i]
+ trdat = num*[[]]
+ for idx,example in enumerate(lines):
+ trdat[idx] = ''.join(example)
+ return trdat
+ else:
+ return None
+
+if __name__ == '__main__':
+ import bz2
+ import sys
+ #import hotshot, hotshot.stats
+
+ def load():
+ #f=bz2.BZ2File('../../splicing/msplicer/standalone/python/data/msplicer_elegansWS120_gc=1_orf=0.dat.bz2');
+ filename='test.dat.bz2'
+ if filename.endswith('.bz2'):
+ f=bz2.BZ2File(filename);
+ else:
+ f=file(filename);
+
+ m=parse_file(f);
+
+ print m.acceptor_bins is None
+ print m.acceptor_limits is None
+ print m.acceptor_penalties is None
+
+ print m.acceptor_splice_b is None
+ print m.acceptor_splice_order is None
+ print m.acceptor_splice_window_left is None
+ print m.acceptor_splice_window_right is None
+ print m.acceptor_splice_alphas is None
+ print m.acceptor_splice_svs is None
+
+ print m.donor_bins is None
+ print m.donor_limits is None
+ print m.donor_penalties is None
+
+ print m.donor_splice_b is None
+ print m.donor_splice_order is None
+ #print m.donor_splice_use_gc is None
+ print m.donor_splice_window_left is None
+ print m.donor_splice_window_right is None
+ print m.donor_splice_alphas is None
+ print m.donor_splice_svs is None
+ print m.intron_len_bins is None
+ print m.intron_len_limits is None
+ print m.intron_len_penalties is None
+ print m.intron_len_min is None
+ print m.intron_len_max is None
+ print m.intron_len_transform is None
+
+ print m.substitution_matrix is None
+
+ print m.intron_len_transform
+ print m.substitution_matrix
+
+ load()
+
+ #prof = hotshot.Profile("model.prof")
+ #benchtime = prof.runcall(load)
+ #prof.close()
+ #stats = hotshot.stats.load("model.prof")
+ #stats.strip_dirs()
+ #stats.sort_stats('time', 'calls')
+ #stats.print_stats(20)
--- /dev/null
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2006-2007 Uta Schulze, Gunnar Raetsch
+# Copyright (C) 2006-2007 Max-Planck-Society
+#
+
+import pdb
+
+def print_line(score, match, mism, qGapb, tGapb, strand, tStrand, qName, qSize, qStart, qEnd,\
+ tName, tSize, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
+ tExonSizes, tStarts, tEnds, exon_length, identity, out_file):
+ """Produces a tab separated line in the out_file which describes the alignment.
+
+ 1. align. score: palma alignment score
+ 2. match: number of matches in the local alignment
+ 3. mismatch: number of mismatches in the local alignment
+ 4. Q gap bases: number of gaps on the target sequence (insert in query acc. to BLAT)
+ 5. T gap bases: number of gaps on the query sequence (insert in target acc. to BLAT)
+ (introns, which are basically gaps on the query, are not included!)
+ 6. strand: '+' or '-' for query strand (second strand for target if provided)
+ if you align two "short" sequences and no whole genome search, the
+ strand equals "+" by default and logic.
+ 7. Q name: first word in header of the entry in the query-file.
+ 8. Q size: length of the sequence of the entry in the query-file
+ 9. Q start: alignment start position on query sequence
+ 10. Q end: alignment end position on query sequence
+ 11. T name: first word in header of the entry in the target-file.
+ 12. T size: length of the sequence of the entry in the target-file
+ 13. T start: alignment start position on target sequence
+ 14. T end: alignment end position on target sequence
+ 15. exon count: number of exons in the alignment. Different to BLAT, where each block is
+ defined by the surrounding gaps, exons are defined by adjacent introns.
+                     Exons can contain gaps. The length of the exon without gaps on both target
+ and query will most likely be different.
+ 16. Q bound: Exact beginning and end of each exon on the query, without showing the gaps.
+ E.g. "ex1_start-ex1_end,ex2_start-ex2_end,..,"
+ Beginning of first exon most likely equals Q start, except the alignment
+ begins with an intron. Analog with Q end for the end of the last exon.
+    17. T bound:     Exact beginning and end of each exon on the target, without showing the gaps.
+ Beginning of first exon most likely equals T start, except the alignment
+ begins with an intron. Analog with T end for the end of the last exon.
+ 18. exon lengths: These are actually the length of the exons in the alignment, where gaps are
+ included.
+ 19. identity: Number of matches divided by length of exon on alignment times 100.
+ If the exon is perfect, and there are no gaps or mismatches, it's 100%.
+ """
+
+ qExonSizes_string = ''
+ for i in qExonSizes:
+ qExonSizes_string = qExonSizes_string + ('%d,' %i)
+ tExonSizes_string = ''
+ for i in tExonSizes:
+ tExonSizes_string = tExonSizes_string + ('%d,' %i)
+ qBounds_string = ''
+ for (i,j) in zip(qStarts, qEnds):
+ qBounds_string = qBounds_string + ('%d-%d,' %(i,j))
+ tBounds_string = ''
+ for (i,j) in zip(tStarts, tEnds):
+ tBounds_string = tBounds_string + ('%d-%d,' %(i,j))
+ exon_length_string = ''
+ for i in exon_length:
+ exon_length_string = exon_length_string + ('%d,' %i)
+ identity_string = ''
+ for i in identity:
+ identity_string = identity_string + ('%1.1f,' %(i*100))
+
+ if tStrand == '+':
+ out_file.write("%4.3f\t%d\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\n"\
+ %(score, match, mism, qGapb, tGapb, strand, qName, qSize, qStart,\
+ qEnd, tName, tSize, tStart, tEnd, num_exons, qBounds_string,\
+ tBounds_string, exon_length_string, identity_string))
+ if tStrand == '-':
+ out_file.write("%4.3f\t%d\t%d\t%d\t%d\t%s%s\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\n"\
+ %(score, match, mism, qGapb, tGapb, strand, tStrand, qName, qSize, qStart,\
+ qEnd, tName, tSize, tStart, tEnd, num_exons, qBounds_string,\
+ tBounds_string, exon_length_string, identity_string))
+ return
+#<------------------------------------------------------------------------------>
+
+#<------------------------------------------------------------------------------>
+# prints a line in the output_file which describes the alignment, in blat psl format
+#<------------------------------------------------------------------------------>
+def print_line_blatformat(score, match, mism, qGapb, tGapb, qStrand, tStrand, qName, qSize, qStart, qEnd,\
+ tName, tSize, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
+ tExonSizes, tStarts, tEnds, exon_length, identity, ssQuality, out_file):
+ """
+ matches int unsigned , # Number of bases that match (in BLAT this excludes repeats, not here)
+ misMatches int unsigned , # Number of bases that don't match
+ repMatches int unsigned , # Number of bases that match but are part of repeats (always zero here)
+ nCount int unsigned , # Number of 'N' bases (always zero here)
+ qNumInsert int unsigned , # Number of inserts in query (length of tStarts)
+ qBaseInsert int unsigned , # Number of bases inserted in query
+ tNumInsert int unsigned , # Number of inserts in target (length of qStarts)
+ tBaseInsert int unsigned , # Number of bases inserted in target
+ strand char(2) , # + or - for query strand, optionally followed by + or - for target strand
+ qName varchar(255) , # Query sequence name
+ qSize int unsigned , # Query sequence size
+ qStart int unsigned , # Alignment start position in query
+ qEnd int unsigned , # Alignment end position in query
+ tName varchar(255) , # Target sequence name
+ tSize int unsigned , # Target sequence size
+ tStart int unsigned , # Alignment start position in target
+ tEnd int unsigned , # Alignment end position in target
+ blockCount int unsigned , # Number of blocks in alignment. A block contains no gaps.
+ blockSizes longblob , # Size of each block in a comma separated list
+ qStarts longblob , # Start of each block in query in a comma separated list
+ tStarts longblob , # Start of each block in target in a comma separated list
+ """
+
+ if tStrand == '-':
+ exon_length.reverse()
+ qEnds.reverse()
+ qStarts = map(lambda x: qSize - x + 1, qEnds)
+ tStarts.reverse()
+ tEnds.reverse()
+ #if strand == '-':
+ # qStarts.reverse()
+ # qEnds.reverse()
+ if True:
+ for i in range(len(qStarts)):
+ if i>0: qStarts[i]=qStarts[i-1]+exon_length[i-1]
+
+ repMatches = 0
+ nCount = 0
+ qStarts_string = ''
+ for i in qStarts:
+ assert(i>0)
+ qStarts_string = qStarts_string + ('%d,' %(i-1))
+ tStarts_string = ''
+ #if len(tStarts)>0:
+ # tStarts_string = tStarts_string + ('%d,' %(tStarts[0]))
+ for i in tStarts:
+ assert(i>0)
+ tStarts_string = tStarts_string + ('%d,' %(i-1))
+ exon_length_string = ''
+ for i in exon_length:
+ exon_length_string = exon_length_string + ('%d,' %i)
+ ssQuality_string = ''
+ for i in ssQuality:
+ ssQuality_string = ssQuality_string + ('%1.1f,' %(i*100))
+ identity_string = ''
+ for i in identity:
+ identity_string = identity_string + ('%1.1f,' %(i*100))
+
+ out_file.write('%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t'\
+ %(match, mism, repMatches, nCount, len(tStarts), qGapb, len(qStarts), tGapb))
+ #out_file.write('%c%c\t' %(tStrand,qStrand))
+ out_file.write('%c\t' %(tStrand))
+
+ out_file.write('%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\n'\
+ %(qName, qSize, qStart-1, qEnd, tName, tSize, tStart-1, tEnd, \
+ num_exons, exon_length_string, qStarts_string, tStarts_string, identity_string, ssQuality_string))
+
+ return
+#<------------------------------------------------------------------------------>
+
+
+#<------------------------------------------------------------------------------>
+# prints the alignment block-wise (query line, comparison line, target line)
+# len(DNA) == len(EST)
+#<------------------------------------------------------------------------------>
+def print_align(DNA, EST, comparison, qIDX, tIDX, out_file, width=60):
+ import numpy
+ idx = []
+ idx_old = []
+ for i in range(1,len(DNA),width): #e.g. 1, 61, 121 ...
+ if i == 1:
+ idx = [i,numpy.minimum(i+width-1,len(DNA))]
+ out_file.write("Q: %9i %s %9i\n" %(qIDX[idx[0]-1], EST[idx[0]-1:idx[1]], qIDX[idx[1]-1]))
+ out_file.write(" %s \n" %(comparison[idx[0]-1:idx[1]]))
+ out_file.write("T: %9i %s %9i\n\n\n" %(tIDX[idx[0]-1], DNA[idx[0]-1:idx[1]], tIDX[idx[1]-1]))
+ else:
+ idx_old = idx
+ idx = [i,numpy.minimum(i+width-1,len(DNA))]
+ out_file.write("Q: %9i %s %9i\n" %(min(qIDX[idx_old[1]-1]+1,qIDX[idx[1]-1]), EST[idx[0]-1:idx[1]], qIDX[idx[1]-1]))
+ out_file.write(" %s \n" %(comparison[idx[0]-1:idx[1]]))
+ out_file.write("T: %9i %s %9i\n\n\n" %(min(tIDX[idx_old[1]-1]+1,tIDX[idx[1]-1]), DNA[idx[0]-1:idx[1]], tIDX[idx[1]-1]))
+ return
+
+
+def print_header(out_file):
+ """prints some lines in output_file that describe what each column means
+
+ Can be turned off via --noHead.
+ """
+ out_file.write("align.\tmatch\tmis-\tQ gap\tT gap\tstrand\tQ\tQ\tQ\tQ\tT\tT\tT\tT\texon\tQ\tT\texon\tidentity\n")
+ out_file.write("score\t\tmatch\tbases\tbases\t\tname\tsize\tstart\tend\tname\tsize\tstart\tend\tcount\tbound\tbound\tlengths\n")
+ out_file.write("--------------------------------------------------------------------------------------------------------------------------------------------------------\n")
+ return
+
+def print_blat_header(out_file):
+ """prints blat header lines in output_file that describe what each column means
+
+ Copied from blat output. Can be turned off via --noHead.
+ """
+ out_file.write('psLayout version 3\n\n')
+ out_file.write("match\tmis-\trep.\tN's\tQ gap\tQ gap\tT gap\tT gap\tstrand\tQ\t\tQ\tQ\tQ\tT\t\tT\tT\tT\tblock\tblockSizes\tqStarts\t tStarts\n")
+ out_file.write('\tmatch\tmatch\t\tcount\tbases\tcount\tbases\t\tname\t\tsize\tstart\tend\tname\t\tsize\tstart\tend\tcount\n')
+ out_file.write('---------------------------------------------------------------------------------------------------------------------------------------------------------------\n')
+
+def print_bed_header(bed_file):
+ """prints some lines in bed_file that describe what each column means
+
+ Can be turned off via --noHead.
+ """
+ bed_file.write("chrom\tchrom\tchrom\tname\tscore\tstrand\tblock\tblock\tblock\n")
+ bed_file.write("\tStart\tEnd\t\t\t\tCount\tSizes\tStart\n")
+ bed_file.write("---------------------------------------------------------------------\n")
+ return
+
+
+
+#<------------------------------------------------------------------------------>
+# prints a line in the output bed_file which describes the alignment
+#<------------------------------------------------------------------------------>
+def print_bed_line(tName, tStart, tEnd, qName, score, tStrand,\
+ num_exons, tStarts, tEnds, bed_file):
+
+ tStarts_string = ''
+ tExonSizes_string = ''
+ for (i,j) in zip(tStarts, tEnds):
+ tStarts_string = tStarts_string + ('%d,' %(i))
+ tExonSizes_string = tExonSizes_string + ('%d,' %(j-i+1))
+
+ bed_file.write("%s\t%d\t%d\t%s\t%4.3f\t%s\t%d\t%s\t%s\n"\
+ %(tName, tStart, tEnd, qName, score, tStrand,\
+ num_exons, tExonSizes_string, tStarts_string))
+ return
+#<------------------------------------------------------------------------------>
+
+
+def print_results(align_info, out_file, bed_file, tName, tSize, qName, qSize, qStrand, tStrand, parameters):
+    """printing results in output_file"""
+
+    score = align_info.score
+    match = align_info.match
+    mism  = align_info.mism
+    qGapb = align_info.qGapb
+    tGapb = align_info.tGapb
+
+    tStart  = align_info.tStart
+    tEnd    = align_info.tEnd
+    tStarts = align_info.tStarts
+    tEnds   = align_info.tEnds
+    tIDX    = align_info.tIDX
+
+    qStart  = align_info.qStart
+    qEnd    = align_info.qEnd
+    qStarts = align_info.qStarts
+    qEnds   = align_info.qEnds
+    qIDX    = align_info.qIDX
+    #if tStrand=='+':
+    #    tStart  = align_info.tStart
+    #    tEnd    = align_info.tEnd
+    #    tStarts = align_info.tStarts
+    #    tEnds   = align_info.tEnds
+    #    tIDX    = align_info.tIDX
+    #else:
+    #    tEnd    = tSize - align_info.tStart + 1
+    #    tStart  = tSize - align_info.tEnd + 1
+    #    tStarts = map(lambda x: tSize - x + 1, align_info.tStarts)
+    #    tEnds   = map(lambda x: tSize - x + 1, align_info.tEnds)
+    #    tIDX    = map(lambda x: tSize - x + 1, align_info.tIDX)
+
+    #if qStrand=='+':
+    #    qStart  = align_info.qStart
+    #    qEnd    = align_info.qEnd
+    #    qStarts = align_info.qStarts
+    #    qEnds   = align_info.qEnds
+    #    qIDX    = align_info.qIDX
+    #else:
+    #    qEnd    = qSize - align_info.qStart + 1
+    #    qStart  = qSize - align_info.qEnd + 1
+    #    qStarts = map(lambda x: qSize - x + 1, align_info.qStarts)
+    #    qEnds   = map(lambda x: qSize - x + 1, align_info.qEnds)
+    #    qIDX    = map(lambda x: qSize - x + 1, align_info.qIDX)
+
+    num_exons   = align_info.num_exons
+    qExonSizes  = align_info.qExonSizes
+    tExonSizes  = align_info.tExonSizes
+    exon_length = align_info.exon_length
+    identity    = align_info.identity
+    ssQuality   = align_info.ssQuality
+    exonQuality = align_info.exonQuality
+    comparison  = align_info.comparison
+    dna_letters = align_info.dna_letters
+    est_letters = align_info.est_letters
+
+    if parameters['print_psl2']:
+        print_line(score, match, mism, qGapb, tGapb, qStrand, tStrand, qName, qSize, qStart, qEnd,\
+                   tName, tSize, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
+                   tExonSizes, tStarts, tEnds, exon_length, identity, out_file)
+    else:
+        print_line_blatformat(score, match, mism, qGapb, tGapb, qStrand, tStrand, qName, qSize, qStart, qEnd,\
+                              tName, tSize, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
+                              tExonSizes, tStarts, tEnds, exon_length, exonQuality, ssQuality, out_file)
+
+    if parameters['print_sequences']:
+        out_file.write("\n")
+        print_align(dna_letters[0], est_letters[0], comparison, qIDX, tIDX, out_file)
+
+    if print_header and bed_file is not None:  # FIXME: print_header is a function object (always truthy) -- probably meant a flag
+        print_bed_header(bed_file)
+    print_bed_line(tName, tStart, tEnd, qName, score, tStrand, num_exons, tStarts, tEnds, bed_file)
+
--- /dev/null
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2006-2007 Uta Schulze, Gunnar Raetsch
+# Copyright (C) 2006-2007 Max-Planck-Society
+#
+
+def set_param(model):
+ import numpy
+
+ class plf: #means piecewise linear function
+ len = 0
+ limits = []
+ penalties = []
+ transform = ''
+ name = ''
+ max_len = 0
+ min_len = 0
+
+ h = plf()
+ h.len = int(model.intron_len_bins) ;
+ h.limits = model.intron_len_limits ;
+ h.penalties = model.intron_len_penalties ;
+ h.name = 'h'
+ h.max_len = int(model.intron_len_max)
+ h.min_len = int(model.intron_len_min)
+ h.transform = model.intron_len_transform
+
+ d = plf()
+ d.len = int(model.donor_bins)
+ d.limits = model.donor_limits
+ d.penalties = model.donor_penalties
+ d.name = 'd'
+ d.max_len = 100
+ d.min_len = -100
+
+ a = plf()
+ a.len = int(model.acceptor_bins)
+ a.limits = model.acceptor_limits
+ a.penalties = model.acceptor_penalties
+ a.name = 'a'
+ a.max_len = 100
+ a.min_len = -100
+
+ mmatrix = model.substitution_matrix
+ return h,d,a,mmatrix
+#<------------------------------------------------------------------------------>
+
+
+#<------------------------------------------------------------------------------>
+# gives back y-value acc. to "support"-value (piecewise linear function)
+#<------------------------------------------------------------------------------>
+def penalty_lookup(plf, supp_value):
+ import numpy
+ limits = plf.limits
+ penalties = plf.penalties
+ transform = plf.transform
+
+ if transform == '':
+ supp_value = supp_value
+ elif transform == 'log':
+ print 'log'
+ supp_value = numpy.log(supp_value)
+ elif transform == 'log(+3)':
+ print 'log(+3)'
+ supp_value = numpy.log(supp_value+3)
+ elif transform == 'log(+1)':
+ print 'log(+1)'
+ supp_value = numpy.log(supp_value+1)
+ elif transform == '(+3)':
+ print '(+3)'
+ supp_value = supp_value+3
+ elif transform == 'mod3':
+ print 'mod3'
+ supp_value = numpy.mod(supp_value,3)
+ else:
+ raise NameError, 'unknown penalty transform'
+
+ #print 'supp_value = %i\n' %supp_value
+
+ if supp_value <= limits[0]:
+ score = penalties[0]
+ #print 'supp_value <= limits[0]'
+ elif supp_value >= limits[plf.len-1]:
+ score = penalties[plf.len-1]
+ #print 'supp_value >= limits[len(limits)-1]'
+ else:
+ for cnt in range(plf.len):
+ if supp_value <= limits[cnt]:
+ upper_idx = cnt
+ break
+ lower_idx = upper_idx-1
+ #print '%1.3f <= %1.3f <= %1.3f\n' %(limits[lower_idx], supp_value, limits[upper_idx])
+ score = ((supp_value-limits[lower_idx])*penalties[upper_idx] + (limits[upper_idx]-supp_value)*penalties[lower_idx])/(limits[upper_idx]-limits[lower_idx])
+
+ return score
+#<------------------------------------------------------------------------------>
+
+
+#<------------------------------------------------------------------------------>
+# counts matches, mismatches, gaps on DNA (means bases inserted in query, qGapb)
+# and gaps on EST (means bases inserted in transcript, tGapb)
+#
+# matchmatrix (columnwise)
+# - A C G T N (dna)
+# - x x x x x x
+# A x x x x x x
+# C x x x x x x
+# G x x x x x x
+# T x x x x x x
+# N x x x x x x
+# (est)
+#<------------------------------------------------------------------------------>
+def evaluate_mmatrix(mmatrix):
+ import numpy
+    #7: [a,a], 14:[c,c], 21:[g,g], 28:[t,t], 35:[n,n]
+ match = mmatrix[7] + mmatrix[14] + mmatrix[21] + mmatrix[28] + mmatrix[35]
+ qGapb = numpy.sum(mmatrix[1:6]) #gaps on DNA
+ tGapb = mmatrix[6] + mmatrix[12] + mmatrix[18] + mmatrix[24] + mmatrix[30] #gaps on EST
+ mism = numpy.sum(mmatrix) - (match + qGapb + tGapb)
+
+ if 0:
+ print 'matches: %d' %match
+ print 'mismatches: %d' %mism
+ print 'qGapb (gaps on DNA): %d' %qGapb
+ print 'tGapb (gaps on EST): %d' %tGapb
+
+ return match, mism, qGapb, tGapb
+#<------------------------------------------------------------------------------>
+
+
+#<------------------------------------------------------------------------------>
+# qStart, qEnd, tStart, tEnd, blocks, qStarts, tStarts
+#<------------------------------------------------------------------------------>
+def get_splice_info(splice_align, est_align):
+ import sys
+ #problem: because of the dangling ends, alignment can start with an intron
+
+ #print splice_align
+ #print est_align
+
+ dna_len = len(splice_align)
+ est_len = len(est_align)
+ qStart = 1 #(default)
+ qEnd = est_len #(default)
+ tStart = 1 #(default)
+ tEnd = dna_len #(default)
+ qStarts = []
+ qEnds = []
+ qExonSizes = []
+ tStarts = []
+ tEnds = []
+ tExonSizes = []
+
+ #<---------------------------------------------------------->
+ #query: dangling start
+ if est_align[0]==4:
+ for i in range(est_len):
+ if est_align[i]!=4:
+ qStart = i+1 #+1 bc index starts with zero
+ break
+
+ #query: dangling end
+ if est_align[est_len-1]==4:
+ list_idx = range(est_len)
+ list_idx.reverse()
+ for i in list_idx:
+ if est_align[i]!=4:
+ qEnd = i+1 #last "4", +1 bc index starts with zero
+ break
+ #qStarts: list of exon starting points on EST
+ #qEnds: list of exon ending points on EST
+    #(2 in est_align means: first nucleotide of new exon)
+
+ index_count = 0
+ exon_label = est_align[qStart-1]
+ qStarts.append(qStart)
+ for i in range(qStart-1,qEnd):
+ if (est_align[i]==1) and (exon_label==2): #new exon labeled "1"
+ exon_label = 1
+ qStarts.append(i+1) #+1 bc index starts with zero, start of this exon saved
+ qEnds.append(i-1+1) #end of last exon labeled "2" saved
+ qExonSizes.append(qEnds[index_count] - qStarts[index_count] +1) #length of last exon
+ index_count+=1
+ elif (est_align[i]==2) and (exon_label==1): #new exon labeled "2"
+ exon_label = 2
+ qStarts.append(i+1) #+1 bc index starts with zero, start of this exon saved
+            qEnds.append(i-1+1) #end of last exon labeled "1" saved
+ qExonSizes.append(qEnds[index_count] - qStarts[index_count] +1) #length of last exon
+ index_count+=1
+ if (i == qEnd-1): #end of last exon
+ qEnds.append(i+1) #end of last exon saved
+ qExonSizes.append(qEnds[index_count] - qStarts[index_count] +1) #length of last exon
+
+ assert(len(qStarts)==len(qEnds))
+
+ if 0:
+ print 'qStart: %d' %qStart
+ print 'qEnd: %d' %qEnd
+ print qStarts
+ print qEnds
+ print qExonSizes
+ #<---------------------------------------------------------->
+
+ #<---------------------------------------------------------->
+ #transcript: dangling start
+ if splice_align[0] == 4:
+ for i in range(dna_len):
+ if splice_align[i]!=4:
+ tStart = i+1 #+1 bc index starts with zero
+ break
+ #if splice_align[tStart-1]==1: #alignment starts with intron
+ #print 'alignment starts with intron'
+
+ #transcript: dangling end
+ if splice_align[dna_len-1]==4:
+ list_idx = range(dna_len)
+ list_idx.reverse()
+ for i in list_idx:
+ if splice_align[i]!=4:
+ tEnd = i+1 #+1 bc index starts with zero
+ break
+ #if splice_align[tEnd-1]==2: #alignment ends with intron
+ #print 'alignment ends with intron'
+
+ #tStarts: list of exon starting points on DNA
+ #tEnds: list of exon ending points on DNA
+
+ index_count = 0
+ act_state = 3 #3 means intron, 0 means exon
+ for i in range(tStart-1,tEnd):
+ if (splice_align[i]==0) and (act_state==3): #first base of exon
+ tStarts.append(i+1) #+1 bc index starts with zero
+ act_state=0 #exon
+ elif (splice_align[i]==1) and (act_state==0): #exon ended one base ago
+ tEnds.append(i-1+1) #+1 bc index starts with zero
+ act_state=3 #intron
+ tExonSizes.append(tEnds[index_count] - tStarts[index_count] +1) #length
+ index_count+=1
+ if (i==tEnd-1) and (act_state==0): #end of alignment reached and still in exon
+ tEnds.append(i+1) #+1 bc index starts with zero
+ tExonSizes.append(tEnds[index_count] - tStarts[index_count] +1) #length
+
+ #<---------------------------------------------------------->
+
+
+
+ if (len(tStarts)!=len(tEnds)):
+ print len(tStarts)
+ print len(tEnds)
+ print 'tStart: %d' %tStart
+ print 'tEnd: %d' %tEnd
+ print tStarts
+ print tEnds
+ sys.exit(2)
+
+ #print 'exons on query: %d, exons on target: %d' %(len(qStarts),len(tStarts))
+ num_exons = len(tStarts)
+ #print num_exons
+ #print len(qStarts)
+
+ if 0:
+ print 'tStart: %d' %tStart
+ print 'tEnd: %d' %tEnd
+ print tStarts
+ print tEnds
+ print tExonSizes
+
+ # assert(num_exons==len(qStarts))
+
+ return qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds
+#<------------------------------------------------------------------------------>
+
+""" read fasta as dictionary """
+def read_fasta(f):
+    """Read an open fasta file object into a dict: sequence name -> sequence.
+
+    Aborts the program on duplicated sequence names.
+    """
+    import sys
+    fasta=dict()
+    for s in f.readlines():
+        if s.startswith('>'):
+            #rstrip('\n') instead of s[1:-1]: do not eat a real character
+            #when the final line of the file has no trailing newline
+            key=s[1:].rstrip('\n')
+            #print key
+            if fasta.has_key(key):
+                sys.stderr.write("ERROR: duplicated sequence name '%s'\n" %(key))
+                sys.exit(2)
+            fasta[key]=""
+        else:
+            fasta[key]+=s.rstrip('\n')
+
+    return fasta
+
+#<------------------------------------------------------------------------------>
+#
+# DNA and EST with gaps and est with intron
+# len(dna) == len(est)
+# identity: list of length exon_numbers: #matches / length(exon_with_gaps)
+#<------------------------------------------------------------------------------>
+def comp_identity(DNA, EST, strand, qStart, tStart, translation, comparison_char):
+    #Walk the gapped alignment columns once and collect per-exon statistics:
+    #exon_length, identity (#matches/length), splice-site quality, exon
+    #quality ((matches-2*gaps)/length), the comparison string, and per-column
+    #query/target coordinate indices (qIDX/tIDX).
+    def set_idx(i,qNum,tNum):
+        qIDX[i] = qNum
+        tIDX[i] = tNum
+        return qIDX, tIDX
+
+    gap_char = translation[0] #e.g. "-"
+    in_char = translation[6] #e.g. "*", intron char
+    equ_char = comparison_char[0] #match char e.g. "|"
+    comp_in_char = comparison_char[1] #intron char when comparing
+    comparison = [' '] *len(EST)
+    qIDX = [-1] *len(EST)
+    tIDX = [-1] *len(DNA)
+    exon_idx = 0
+    matches = []
+    gaps = []
+    exon_length = []
+    identity = []
+    exonQuality = [] # matches - 2*gaps
+    ssQuality = []
+    state = 3 #intron:3, exon:0
+    EST_started = 0
+    #bugfix: initialize the splice-site accumulators. They were previously
+    #undefined (NameError) when the first exon was entered without a finished
+    #exon before it, e.g. when the alignment starts with an intron.
+    last_ssQuality = 0.0
+    last_ssLen = 0
+
+    for i in range(len(DNA)):
+        if (i>0 and i<len(DNA)-1):
+            if (EST[i] == in_char): #intron
+                (qIDX, tIDX) = set_idx(i,qIDX[i-1],tIDX[i-1]+1)
+                comparison[i] = comp_in_char
+                if (state==0): #exon finished
+                    identity.append(float(matches[exon_idx])/float(exon_length[exon_idx])) #save identity for finished exon
+                    exonQuality.append(float(matches[exon_idx]-2*gaps[exon_idx])/float(exon_length[exon_idx])) #save identity for finished exon
+                    exon_idx+=1 #next exon starts sooner or later
+                    state = 3 #now intron
+                    last_ssQuality = 0.0 ;
+                    last_ssLen = 0 ;
+                    for k in range(5):
+                        if i-k-1<0: break
+                        last_ssLen+=1
+                        if DNA[i-k-1]==EST[i-k-1]: last_ssQuality+=1.0
+                        else:
+                            if (DNA[i-k-1]=='N' and EST[i-k-1]!=gap_char) or (DNA[i-k-1]!=gap_char and EST[i-k-1]=='N'): last_ssQuality+=0.5
+                        #print k,DNA[i-k-1],EST[i-k-1],last_ssQuality
+            else: #exon
+                if (state==0): #still in same exon
+                    if EST_started and DNA[i]!=gap_char: # only count DNA length
+                        exon_length[exon_idx] += 1 #one bp added
+                    if EST[i]!=gap_char: EST_started=1 ;
+                    #print i,EST[i],EST_started,exon_length[exon_idx]
+                    if (DNA[i]==EST[i]):
+                        matches[exon_idx]+=1 #one match added
+                    else:
+                        if (DNA[i]=='N' and EST[i]!=gap_char) or (EST[i]=='N' and DNA[i]!=gap_char): matches[exon_idx]+=0.5
+                else: #new exon
+                    for k in range(5):
+                        if i+k>=len(DNA) or i+k>len(EST): break
+                        last_ssLen+=1 ;
+                        if DNA[i+k]==EST[i+k]: last_ssQuality+=1.0 ;
+                        else:
+                            if (DNA[i+k]=='N' and EST[i+k]!=gap_char) or (DNA[i+k]!=gap_char and EST[i+k]=='N'): last_ssQuality+=0.5
+                        #print k,DNA[i+k],EST[i+k],last_ssQuality
+                    ssQuality.append(last_ssQuality/last_ssLen)
+                    if DNA[i]!=gap_char: exon_length.append(1)
+                    else: exon_length.append(0)
+                    state = 0 #now exon
+                    if (DNA[i]==EST[i]):
+                        gaps.append(0)
+                        matches.append(1)
+                    else:
+                        if (DNA[i]=='N' and EST[i]!=gap_char) or (EST[i]=='N' and DNA[i]!=gap_char): matches.append(0.5); gaps.append(0)
+                        else: matches.append(0); gaps.append(1)
+
+                if (DNA[i]==EST[i]) and (DNA[i]!='-'): #match
+                    (qIDX, tIDX) = set_idx(i,qIDX[i-1]+1,tIDX[i-1]+1)
+                    comparison[i] = equ_char
+                elif (DNA[i]==EST[i]) and (DNA[i]=='-'): #strange match
+                    (qIDX, tIDX) = set_idx(i,qIDX[i-1],tIDX[i-1])
+                elif (EST[i]==gap_char): #gap on est
+                    (qIDX, tIDX) = set_idx(i,qIDX[i-1],tIDX[i-1]+1)
+                elif (DNA[i]==gap_char): #gap on dna
+                    (qIDX, tIDX) = set_idx(i,qIDX[i-1]+1,tIDX[i-1])
+                else: #mismatch
+                    (qIDX, tIDX) = set_idx(i,qIDX[i-1]+1,tIDX[i-1]+1)
+
+        elif (i==0):
+            if (EST[i] == in_char): #alignment starts with intron
+                (qIDX, tIDX) = set_idx(i,0,1)
+                comparison[i] = comp_in_char
+            else: #alignment starts with exon
+                state = 0 #go into exon_state
+                if DNA[i]!=gap_char: exon_length.append(1)
+                else: exon_length.append(0)
+                if (DNA[i]==EST[i]) and (DNA[i]!='-'): #match
+                    #(qIDX, tIDX) = set_idx(i,1,1)
+                    EST_started = 1
+                    (qIDX, tIDX) = set_idx(i,qStart,tStart)
+                    matches.append(1)
+                    gaps.append(0)
+                    comparison[i] = equ_char
+                elif (DNA[i]==EST[i]) and (DNA[i]=='-'): #strange match
+                    (qIDX, tIDX) = set_idx(i,qStart,tStart)
+                    matches.append(0)
+                    gaps.append(1)
+                elif (EST[i]==gap_char): #gap on est
+                    #(qIDX, tIDX) = set_idx(i,0,1)
+                    (qIDX, tIDX) = set_idx(i,qStart,tStart)
+                    matches.append(0)
+                    gaps.append(1)
+                elif (DNA[i]==gap_char): #gap on dna
+                    #(qIDX, tIDX) = set_idx(i,1,0)
+                    (qIDX, tIDX) = set_idx(i,qStart,tStart)
+                    matches.append(0)
+                    gaps.append(1)
+                else: #mismatch
+                    #(qIDX, tIDX) = set_idx(i,1,1)
+                    (qIDX, tIDX) = set_idx(i,qStart,tStart)
+                    gaps.append(0)
+                    if (DNA[i]=='N' or EST[i]=='N'): matches.append(0.5)
+                    else: matches.append(0) #bugfix: was 0.5 (same as the N branch); a plain mismatch scores 0 everywhere else
+        elif (i==len(DNA)-1):
+            if (EST[i] == in_char): #alignment ends with intron, with exons everything ok
+                assert(state==3) #ok, because intron longer than 4
+                comparison[i] = comp_in_char
+                (qIDX, tIDX) = set_idx(i,qIDX[i-1],tIDX[i-1]+1)
+            else: #alignment ends with exon, exon finished
+                if (state==0): #finish exon
+                    if DNA[i]!=gap_char: exon_length[exon_idx] += 1 #one bp added
+                    else: exon_length[exon_idx] += 1 #one bp added  #NOTE(review): gap column also counted here, unlike the mid-exon case -- confirm intended
+                    if (DNA[i]==EST[i]) and (DNA[i]!=gap_char): #match
+                        matches[exon_idx]+=1 #one match added
+                        comparison[i] = equ_char
+                        (qIDX, tIDX) = set_idx(i,qIDX[i-1]+1,tIDX[i-1]+1)
+                    elif (DNA[i]==EST[i]) and (DNA[i]==gap_char): #strange match
+                        (qIDX, tIDX) = set_idx(i,qIDX[i-1],tIDX[i-1])
+                    elif (EST[i]==gap_char): #gap on est
+                        (qIDX, tIDX) = set_idx(i,qIDX[i-1],tIDX[i-1]+1)
+                    elif (DNA[i]==gap_char): #gap on dna
+                        (qIDX, tIDX) = set_idx(i,qIDX[i-1]+1,tIDX[i-1])
+                    else: #mismatch
+                        if (DNA[i]=='N' or EST[i]=='N'): matches[exon_idx]+=0.5
+                        (qIDX, tIDX) = set_idx(i,qIDX[i-1]+1,tIDX[i-1]+1)
+                    identity.append(float(matches[exon_idx])/float(exon_length[exon_idx])) #save identity for finished exon
+                    exonQuality.append(float(matches[exon_idx]-2*gaps[exon_idx])/float(exon_length[exon_idx])) #save identity for finished exon
+                else: #last exon has length 1
+                    if DNA[i]!=gap_char: exon_length.append(1)
+                    else: exon_length.append(1) #NOTE(review): gap also counted as length 1 (cf. append(0) at exon start) -- confirm intended
+                    if (DNA[i]==EST[i]): #match
+                        matches.append(1) #one match added
+                        gaps.append(0)
+                        comparison[i] = equ_char
+                        (qIDX, tIDX) = set_idx(i,qIDX[i-1]+1,tIDX[i-1]+1)
+                    elif (EST[i]==gap_char): #gap on est
+                        gaps.append(1)
+                        matches.append(0)
+                        (qIDX, tIDX) = set_idx(i,qIDX[i-1],tIDX[i-1]+1)
+                    elif (DNA[i]==gap_char): #gap on dna
+                        gaps.append(1)
+                        matches.append(0)
+                        (qIDX, tIDX) = set_idx(i,qIDX[i-1]+1,tIDX[i-1])
+                    else: #mismatch
+                        gaps.append(0)
+                        if (DNA[i]=='N' or EST[i]=='N'): matches.append(0.5)
+                        else: matches.append(0)
+                        (qIDX, tIDX) = set_idx(i,qIDX[i-1]+1,tIDX[i-1]+1)
+                    identity.append(float(matches[exon_idx])/float(exon_length[exon_idx])) #save identity for finished exon
+                    exonQuality.append(float(matches[exon_idx]-2*gaps[exon_idx])/float(exon_length[exon_idx])) #save identity for finished exon
+        else:
+            print 'this "else" should not happen'
+
+    comparison = reduce(lambda x,y: x+y, comparison)
+    #print qIDX
+    #print tIDX
+    #print exon_length
+    #print ssQuality, exonQuality, identity
+
+    return exon_length, identity, ssQuality, exonQuality, comparison, qIDX, tIDX
+#<------------------------------------------------------------------------------>
+
+
+#<------------------------------------------------------------------------------>
+#
+#<------------------------------------------------------------------------------>
+def reverse_complement(DNA):
+    """Return the reverse complement of DNA (uppercased); unknown letters
+    become 'N' with a warning printed to stdout."""
+    import sys
+    #renamed from 'dict': do not shadow the builtin
+    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
+
+    DNA = DNA.upper()
+    bases = []
+    cnt_list = range(len(DNA))
+    cnt_list.reverse()
+    for i in cnt_list:
+        if not complement.has_key(DNA[i]):
+            print 'unknown letter', DNA[i], '-> translating to N'
+            bases.append('N')
+        else:
+            bases.append(complement[DNA[i]])
+    #single join instead of repeated string concatenation (was quadratic)
+    return ''.join(bases)
+#<------------------------------------------------------------------------------>
+
+
+#<------------------------------------------------------------------------------>
+#
+#<------------------------------------------------------------------------------>
+def parse_wublast(output, e_value_bound):
+    """Parse tab-separated wublast output lines into a wublast_info record,
+    merging the hits of each (query, target, group-size) group and keeping
+    only hits with e-value <= e_value_bound."""
+    import numpy
+    import sys
+
+    strand_matrix='- +'
+
+    class wublast_info:
+        def __init__(self):
+            #bugfix: these used to be class-level lists, which are shared by
+            #all instances -- repeated parse_wublast calls accumulated the
+            #results of previous calls. Instance attributes fix that.
+            self.score = []
+            self.q_name = []
+            self.t_name = []
+            self.q_strand = []
+            self.t_strand = []
+            self.q_start = []
+            self.q_end = []
+            self.t_start = []
+            self.t_end = []
+
+    est_entry = 0
+    still_in_group = 0
+
+    est_wbi = wublast_info()
+    for line in output:
+        fields = line.strip().split('\t')
+        #print line, still_in_group
+        if float(fields[2]) <= e_value_bound:
+            if len(est_wbi.q_name)>0:
+                flag = (fields[0] == est_wbi.q_name[len(est_wbi.t_name)-1]) and (fields[1] == est_wbi.t_name[len(est_wbi.t_name)-1]) and (int(fields[3]) == group_size)
+            else:
+                flag = True
+
+            if (still_in_group == 0) or (not flag):
+                if (not still_in_group == 0):
+                    sys.stderr.write('blastn output format problem near line:\n'+line)
+
+                # start a new group
+                est_wbi.score.append(float(fields[4]))
+                est_wbi.q_name.append(fields[0])
+                est_wbi.t_name.append(fields[1])
+                est_wbi.q_strand.append(strand_matrix[int(fields[16])+1])
+                est_wbi.t_strand.append(strand_matrix[int(fields[19])+1])
+                if strand_matrix[int(fields[16])+1] == '+':
+                    est_wbi.q_start.append(int(fields[17]))
+                    est_wbi.q_end.append(int(fields[18]))
+                else:
+                    est_wbi.q_start.append(int(fields[18]))
+                    est_wbi.q_end.append(int(fields[17]))
+                est_wbi.t_start.append(int(fields[20]))
+                est_wbi.t_end.append(int(fields[21]))
+                still_in_group = int(fields[3])-1
+                #print 'still_in_group', still_in_group
+                group_size = int(fields[3])
+            else:
+                # extend a group
+                assert(fields[0] == est_wbi.q_name[len(est_wbi.t_name)-1])
+                assert(fields[1] == est_wbi.t_name[len(est_wbi.t_name)-1])
+                assert(int(fields[3]) == group_size)
+                est_wbi.score[len(est_wbi.score)-1] += float(fields[4])
+                if strand_matrix[int(fields[16])+1] == '+':
+                    est_wbi.q_start[len(est_wbi.q_start)-1] = numpy.minimum(est_wbi.q_start[len(est_wbi.q_start)-1],int(fields[17]))
+                    est_wbi.q_end[len(est_wbi.q_end)-1] =numpy.maximum(est_wbi.q_end[len(est_wbi.q_end)-1],int(fields[18]))
+                else:
+                    est_wbi.q_start[len(est_wbi.q_start)-1] =numpy.minimum(est_wbi.q_start[len(est_wbi.q_start)-1],int(fields[18]))
+                    est_wbi.q_end[len(est_wbi.q_end)-1] =numpy.maximum(est_wbi.q_end[len(est_wbi.q_end)-1],int(fields[17]))
+                est_wbi.t_start[len(est_wbi.t_start)-1] =numpy.minimum(est_wbi.t_start[len(est_wbi.t_start)-1],int(fields[20]))
+                est_wbi.t_end[len(est_wbi.t_end)-1] =numpy.maximum(est_wbi.t_end[len(est_wbi.t_end)-1],int(fields[21]))
+                still_in_group -= 1
+
+    if 0:
+        print est_wbi.q_name
+        print est_wbi.t_name
+        print est_wbi.q_strand
+        print est_wbi.t_strand
+        print est_wbi.q_start
+        print est_wbi.q_end
+        print est_wbi.t_start
+        print est_wbi.t_end
+
+    return est_wbi
+#<------------------------------------------------------------------------------>
--- /dev/null
+import string
+
+class predictions(object):
+    """Holds the candidate positions and svm scores for one splice-site type."""
+    def __init__(self, positions=None, scores=None):
+        self.positions=positions
+        self.scores=scores
+
+    def set_positions(self, positions):
+        self.positions=positions
+    def get_positions(self):
+        return self.positions
+
+    def set_scores(self, scores):
+        self.scores=scores
+    def get_scores(self):
+        return self.scores
+
+    def __str__(self):
+        #bugfix: a separator was missing between the two fields
+        return 'positions: ' + `self.positions` + ' scores: ' + `self.scores`
+    def __repr__(self):
+        return self.__str__()
+
+class sequence(object):
+    """A named DNA sequence with a (start,end) range and empty prediction
+    slots for the 'acceptor' and 'donor' splice-site types.
+
+    Note: the (start,end) parameter uses Python 2 tuple unpacking.
+    """
+    def __init__(self, name, seq, (start,end)):
+        assert(start<end<=len(seq))
+        self.start=start
+        self.end=end
+        self.name=name
+        self.seq=seq
+        self.preds=dict()
+        self.preds['acceptor']=predictions()
+        self.preds['donor']=predictions()
+
+    def __str__(self):
+        s="start:" + `self.start`
+        s+=" end:" + `self.end`
+        s+=" name:" + `self.name`
+        s+=" sequence:" + `self.seq[0:10]`  #only the first 10 bases, keeps output short
+        s+="... preds:" + `self.preds`
+        return s
+    def __repr__(self):
+        return self.__str__()
+
+def seqdict(dic):
+    """ takes a fasta dict as input and
+    generates a list of sequence objects from it """
+    sequences=list()
+
+    #translate string to ACGT / all non ACGT letters are mapped to A
+    #tab is a 256-entry table for string.translate: entry i is the uppercased
+    #chr(i) when it is one of a/c/g/t/A/C/G/T, otherwise 'A'
+    tab=''
+    for i in xrange(256):
+        if chr(i).upper() in 'ACGT':
+            tab+=chr(i).upper()
+        else:
+            tab+='A'
+
+    for seqname in dic:
+        seq=string.translate(dic[seqname], tab)
+        seq=seq.upper()  #redundant after the table above (already uppercased), but harmless
+        #if end<0:
+        #    stop=len(seq)+end
+        #else:
+        #    stop=end
+
+        sequences.append(sequence(seqname, seq, (0,len(seq))))
+        #sequences.append(seq)
+
+    return sequences
--- /dev/null
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2006-2007 Soeren Sonnenburg
+# Written (W) 2007 Gunnar Raetsch
+# Copyright (C) 2007 Fraunhofer Institute FIRST and Max-Planck-Society
+#
+
+import sys
+import numpy
+import seqdict
+import model
+
+from shogun.Classifier import SVM
+from shogun.Features import StringCharFeatures,DNA
+# from shogun.Kernel import WeightedDegreeCharKernel
+# In the svn version of shogun, this has been renamed.
+from shogun.Kernel import WeightedDegreeStringKernel
+
+class svm_splice_model(object):
+    """Splice-site predictor: a weighted-degree string-kernel SVM scored on a
+    fixed window (window_left .. window_right) around each occurrence of the
+    given consensus strings (e.g. GT/GC donors, AG acceptors)."""
+    def __init__(self, order, traindat, alphas, b, (window_left,offset,window_right), consensus):
+
+        f=StringCharFeatures(DNA)
+        f.set_string_features(traindat)
+        # In the svn version of shogun, this has been renamed.
+        wd_kernel = WeightedDegreeStringKernel(f,f, int(order))
+        # wd_kernel = WeightedDegreeCharKernel(f,f, int(order))
+        wd_kernel.io.set_target_to_stderr()
+
+        self.svm=SVM(wd_kernel, alphas, numpy.arange(len(alphas), dtype=numpy.int32), b)
+        self.svm.io.set_target_to_stderr()
+        self.svm.parallel.set_num_threads(self.svm.parallel.get_num_cpus())
+        #self.svm.set_linadd_enabled(False)
+        #self.svm.set_batch_computation_enabled(true)
+
+        self.window_left=int(window_left)
+        self.window_right=int(window_right)
+
+        self.consensus=consensus
+        self.wd_kernel=wd_kernel
+        self.traindat=f
+        self.offset = offset ;
+
+    def get_positions(self, sequence):
+        """DEPRECATED: Please use get_positions_from_seqdict"""
+        print "DEPRECATED: Please use get_positions_from_seqdict"
+        positions=list()
+
+        for cons in self.consensus:
+            l=sequence.find(cons)
+            while l>-1:
+                #if l<len(sequence)-self.window_right-2 and l>self.window_left:
+                positions.append(l+self.offset)
+                l=sequence.find(cons, l+1)
+
+        positions.sort()
+        return positions
+
+    def get_predictions(self, sequence, positions):
+        """DEPRECATED: Please use get_predictions_from_seqdict"""
+        print "DEPRECATED: Please use get_predictions_from_seqdict"
+        seqlen=self.window_right+self.window_left+2
+        num=len(positions)
+
+        #testdat = numpy.chararray((seqlen,num),1,order='FORTRAN')
+        testdat = num*[[]]
+
+        for j in xrange(num):
+            i=positions[j] - self.offset ;
+            start = i-self.window_left
+            if start<0:
+                s_start='A'*(-start)
+                start = 0;
+            else:
+                s_start = ''
+            stop = i+self.window_right+2
+            if stop>len(sequence):
+                #bugfix: pad one extra 'A' because stop is clamped to len-1
+                #below (matches get_predictions_from_seqdict); the window was
+                #one character short at the sequence end
+                s_stop = 'A'*(stop-len(sequence) +1)
+                stop=len(sequence) - 1 ;
+            else:
+                s_stop = '' ;
+            s= s_start + sequence[start:stop] + s_stop
+            #bugfix: removed leftover "testdat[:,j]=list(s)" -- testdat is a
+            #plain list, so tuple indexing raised TypeError on every call;
+            #the assignment below is the intended one
+            testdat[j]=s
+
+        t=StringCharFeatures(DNA)
+        t.set_string_features(testdat)
+
+        self.wd_kernel.init(self.traindat, t)
+        l=self.svm.classify().get_labels()
+        sys.stderr.write("\n...done...\n")
+        return l
+
+
+    def get_predictions_from_seqdict(self, seqdic, site):
+        """ we need to generate a huge test features object
+            containing all locations found in each seqdict-sequence
+            and each location (this is necessary to efficiently
+            (==fast,low memory) compute the splice outputs
+        """
+
+        #seqlen=self.window_right+self.window_left+2
+
+        num=0
+        for s in seqdic:
+            num+= len(s.preds[site].positions)
+
+        #testdat = numpy.chararray((seqlen,num), 1, order='FORTRAN')
+        testdat = num*[[]]
+
+        k=0
+        si = 0 ;
+        for s in seqdic:
+            sequence=s.seq
+            positions=s.preds[site].positions
+            si += 1
+            for j in xrange(len(positions)):
+                if len(positions)>50000 and j%10000==0:
+                    sys.stderr.write('sequence %i: progress %1.2f%%\r' %(si,(100*j/len(positions))))
+                i=positions[j] - self.offset
+                start = i-self.window_left
+                if start<0:
+                    s_start='A'*(-start)
+                    start = 0;
+                else:
+                    s_start = ''
+                stop = i+self.window_right+2
+                if stop>len(sequence):
+                    s_stop = 'A'*(stop-len(sequence) +1)
+                    stop=len(sequence) - 1 ;
+                else:
+                    s_stop = '' ;
+                s= s_start + sequence[start:stop] + s_stop
+                #print len(s)
+                #s=sequence[i-self.window_left:i+self.window_right+2]
+                #testdat[:,k]=list(s)
+                testdat[k]=s
+                k+=1
+
+            if len(positions)>50000:
+                sys.stderr.write('\n')
+
+        t=StringCharFeatures(DNA)
+        t.set_string_features(testdat)
+
+        self.wd_kernel.init(self.traindat, t)
+        l=self.svm.classify().get_labels()
+        sys.stderr.write("\n...done...\n")
+
+        k=0
+        for s in seqdic:
+            num=len(s.preds[site].positions)
+            scores= num * [0]
+            for j in xrange(num):
+                scores[j]=l[k]
+                k+=1
+            s.preds[site].set_scores(scores)
+
+    def get_positions_from_seqdict(self, seqdic, site):
+        #record all consensus occurrences (shifted by self.offset) per sequence
+        for d in seqdic:
+            positions=list()
+            sequence=d.seq
+            for cons in self.consensus:
+                l=sequence.find(cons)
+                while l>-1:
+                    #if l<len(sequence)-self.window_right-2 and l>self.window_left:
+                    positions.append(l+self.offset)
+                    l=sequence.find(cons, l+1)
+            positions.sort()
+            d.preds[site].set_positions(positions)
+
+
+class signal_detectors(object):
+    """Bundles an acceptor and a donor svm_splice_model built from a trained
+    model object."""
+    def __init__(self, model, donor_splice_use_gc):
+        #optionally also allow the non-canonical GC donor consensus
+        if donor_splice_use_gc:
+            donor_consensus=['GC','GT']
+        else:
+            donor_consensus=['GT']
+
+        #acceptor uses offset 2, donor offset 0 -- presumably to place the
+        #reported position relative to the AG/GT consensus; TODO confirm
+        self.acceptor=svm_splice_model(model.acceptor_splice_order, model.acceptor_splice_svs,
+                                       numpy.array(model.acceptor_splice_alphas).flatten(), model.acceptor_splice_b,
+                                       (model.acceptor_splice_window_left, 2, model.acceptor_splice_window_right), ['AG'])
+        self.donor=svm_splice_model(model.donor_splice_order, model.donor_splice_svs,
+                                    numpy.array(model.donor_splice_alphas).flatten(), model.donor_splice_b,
+                                    (model.donor_splice_window_left, 0, model.donor_splice_window_right),
+                                    donor_consensus)
+
+    def set_sequence(self, seq):
+        #NOTE(review): svm_splice_model defines no set_sequence method in this
+        #module -- calling this would raise AttributeError; confirm it is unused
+        self.acceptor.set_sequence(seq)
+        self.donor.set_sequence(seq)
+
+    def predict_acceptor_sites(self, seq):
+        pos=self.acceptor.get_positions(seq)
+        sys.stderr.write("computing svm output for acceptor positions\n")
+        pred=self.acceptor.get_predictions(seq, pos)
+        return (pos,pred)
+
+    def predict_donor_sites(self,seq):
+        pos=self.donor.get_positions(seq)
+        sys.stderr.write("computing svm output for donor positions\n")
+        pred=self.donor.get_predictions(seq, pos)
+        return (pos,pred)
+
+    def predict_acceptor_sites_from_seqdict(self, seqs):
+        self.acceptor.get_positions_from_seqdict(seqs, 'acceptor')
+        sys.stderr.write("computing svm output for acceptor positions\n")
+        self.acceptor.get_predictions_from_seqdict(seqs, 'acceptor')
+
+    def predict_donor_sites_from_seqdict(self, seqs):
+        self.donor.get_positions_from_seqdict(seqs, 'donor')
+        sys.stderr.write("computing svm output for donor positions\n")
+        self.donor.get_predictions_from_seqdict(seqs, 'donor')
+
--- /dev/null
+#!/usr/bin/env python2.4
+
+import sys
+#import os
+import shutil
+from distutils.core import setup, Extension
+
+try:
+    import numpy
+except ImportError:  #bugfix: bare except also swallowed SystemExit/KeyboardInterrupt
+    sys.stderr.write("WARNING: did not find 'numpy'\n")
+
+try:
+    import shogun
+except ImportError:
+    sys.stderr.write("WARNING: did not find 'shogun'\n")
+    sys.stderr.write("         shogun is required for computing splice site scores\n")
+    sys.stderr.write("         See shogun website: http://www.shogun-toolbox.org)\n")
+
+#for any command other than 'sdist', run a dummy setup first so that swig
+#generates src/alignment_tool.py before the real setup below needs it
+if len(sys.argv) < 2 or sys.argv[1] != 'sdist':  #bugfix: guard against missing argv[1] (IndexError)
+    sys.stderr.write('Using swig to construct interface file alignment_tool.py\n')
+    setup (name = 'palma._alignment_tool',
+           version = '0.3.7',
+           description = '!!!This is a hack to ensure that swig runs first. Real setup is below!!!',
+           author = ['Uta Schulze', 'Bettina Hepp', 'Gunnar Raetsch', \
+                     'Soeren Sonnenburg','Cheng Soon Ong'],
+           author_email = ['Gunnar.Raetsch@tuebingen.mpg.de'],
+           license='GPL2',
+           url = 'http://www.fml.tuebingen.mpg.de/raetsch/projects/palma',
+           ext_modules = [Extension('palma._alignment_tool',
+                                    sources = ['src/bound_matrix.cpp', 'src/myalign_1best.cpp', \
+                                               'src/fill_matrix_1best.cpp', 'src/result_align_1best.cpp',\
+                                               'src/alignment_tool_1best.i'])],
+           options={'build_ext':{'swig_opts':'-c++'}},
+           )
+    shutil.copy2('src/alignment_tool.py','palma/alignment_tool.py')
+    #os.rename('src/alignment_tool.py','palma/alignment_tool.py')
+
+sys.stderr.write('PALMA - mRNA to Genome Alignments using Large Margin Algorithms\n')
+#real package setup: the pure-python palma package plus the swig-wrapped
+#C++ alignment extension built from the sources under src/
+setup (name = 'palma',
+       version = '0.3.7',
+       description = 'PALMA - mRNA to Genome Alignments using Large Margin Algorithms',
+       author = ['Uta Schulze', 'Bettina Hepp', 'Gunnar Raetsch', \
+                 'Soeren Sonnenburg','Cheng Soon Ong'],
+       author_email = ['Gunnar.Raetsch@tuebingen.mpg.de'],
+       license='GPL2',
+       url = 'http://www.fml.tuebingen.mpg.de/raetsch/projects/palma',
+       #py_modules=['palma.palma_utils','palma.alignment_tool','palma.genomic',\
+       py_modules=['palma.palma_utils','palma.genomic',\
+                   'palma.model','palma.output_formating','palma.seqdict','palma.signal_detectors'],
+       ext_modules = [Extension('palma._alignment_tool',
+                                sources = ['src/bound_matrix.cpp', 'src/myalign_1best.cpp', \
+                                           'src/fill_matrix_1best.cpp', 'src/result_align_1best.cpp',\
+                                           'src/alignment_tool_1best.i'])],
+       packages=['palma'],
+       package_data={'palma': ['data/*.param.bz2']},
+       scripts=['scripts/palma-align.py'],
+       options={'build_ext':{'swig_opts':'-c++'}},
+       long_description='''
+    PALMA aligns two genetic sequences 'the best way' according
+    to its underlying algorithm and trained parameters. See the paper on the website for details.''')
+
+# cleanup swig outputs so that each build is clean
+#os.remove('palma/alignment_tool.py')
+#os.remove('src/alignment_tool_1best_wrap.cpp')
--- /dev/null
+#!/usr/bin/env python
+# -*- coding:latin-1 -*-
+
+import sys
+import cPickle
+
+#count, per predicted alignment, how many genomic positions its predicted
+#exons wrongly overlap with ground-truth introns
+ground_truth = cPickle.load(open(sys.argv[1], 'rb')) #bugfix: load expects a file object, not a filename
+prediction   = cPickle.load(open(sys.argv[2], 'rb'))
+
+ESTs  = ground_truth['TestEsts']
+Exons = ground_truth['TestExon']
+Acc   = ground_truth['TestAcc']
+Don   = ground_truth['TestDon']
+DNA   = ground_truth['Test']
+
+totalWrongPos = [0]*len(prediction)
+
+for pos,alignmentObj in enumerate(prediction):
+    predPos = zip(alignmentObj.tStarts, alignmentObj.tEnds) #bugfix: was undefined 'zips'
+    wrongPos = 0
+
+    for predExonStart,predExonStop in predPos:
+
+        #bugfix: loop variable renamed from 'pos' -- it shadowed the outer
+        #loop index used below to store the result
+        for exon_idx,currentExons in enumerate(Exons):
+            for idx in range(1,len(currentExons)-1):
+
+                intronStart = currentExons[idx][1]
+                intronStop = currentExons[idx+1][0]
+
+                # est is covering full intron
+                if intronStart > predExonStart and intronStop < predExonStop:
+                    #fprintf(fh,sprintf(' %s is completely overlapping from %d %d\n',currentESTName,intronStart,intronStop));
+                    wrongPos += intronStop-intronStart
+                # est is nested inside intron
+                elif intronStart < predExonStart and intronStop > predExonStop:
+                    #fprintf(fh,sprintf(' %s is completely inside intron from %d %d\n',currentESTName,predExonStart,predExonStop));
+                    wrongPos += intronStop-intronStart
+                # end of exonth is nested inside predExoniction
+                elif intronStart > predExonStart and predExonStop > intronStart and intronStop > predExonStop:
+                    #fprintf(fh,sprintf('%s is upstream overlapping from %d %d\n',currentESTName,intronStart,predExonStop));
+                    wrongPos += intronStop-intronStart
+                # predExoniction is nested inside exonth
+                elif intronStart < predExonStart and predExonStart < intronStop and intronStop < predExonStop:
+                    #fprintf(fh,sprintf('%s is downstream overlapping from %d %d\n',currentESTName,predExonStart,intronStop));
+                    wrongPos += intronStop-intronStart
+                else:
+                    pass
+
+    totalWrongPos[pos] = wrongPos #bugfix: was undefined 'totalWrongPositions'