+ added small scripts for EST filtering
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 11 Dec 2007 18:20:07 +0000 (18:20 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 11 Dec 2007 18:20:07 +0000 (18:20 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7018 e1793c9e-67f9-0310-80fc-b846ff1f7b36

python/TrainingParam.py
python/qpalma.py
tools/calculateAlignmentQuality.m
tools/calculateSizes [new file with mode: 0644]
tools/extractESTs [new file with mode: 0755]

index 2f7d074..038a007 100644 (file)
@@ -6,7 +6,8 @@ class Param:
    def __init__(self):
       """ default parameters """
 
-      self.basedir = '/fml/ag-raetsch/share/projects/qpalma/elegans_palma'
+      #self.basedir = '/fml/ag-raetsch/share/projects/qpalma/elegans_palma'
+      self.basedir = '/fml/ag-raetsch/share/projects/qpalma/zebrafish'
       self.MAX_MEM = 31000;
       self.LOCAL_ALIGN = 1;
       self.init_param = 1;
@@ -15,7 +16,7 @@ class Param:
       self.C = 0.001;
       self.microexon = 0;
       self.prob = 0;
-      self.organism = 'elegans'
+      self.organism = 'zebrafish'
       self.expt = 'training'
          
       self.insertion_prob  = self.prob/3 ;
index 977d340..4b5af6a 100644 (file)
@@ -3,7 +3,7 @@
 
 ###########################################################
 #
-# This file containts the 
+# This file contains the 
 #
 ###########################################################
 
@@ -311,7 +311,6 @@ class QPalma:
 
                   [h,d,a,mmatrix] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
                   
-
             if exampleIdx==10:
                break
 
index 0338a4b..bab0263 100644 (file)
@@ -9,13 +9,20 @@ load /fml/ag-raetsch/share/projects/altsplicedata/zebrafish/confirmed_sequences.
 ground_truth = genes;
 clear genes;
 
-fh = fopen('overlapping.pos','w+')
+disp('Loaded data...');
+
+fh = fopen('overlapping.pos','w+');
+
 for i = 1:length(ground_truth)
    currentEntry = ground_truth(i);
    currentExons = currentEntry.exons;
    assert (length(currentEntry.transcripts) == length(currentEntry.exons));
    numberOfEsts = length(currentEntry.transcripts);
 
+   if mod(i,100) == 0
+      fprintf('.')
+   end
+
    for j = 1:length(testrun)
       currentPred = testrun(j);
 
@@ -58,18 +65,17 @@ for i = 1:length(ground_truth)
                   %%disp(sprintf('%i %i %i %i %i %i\n',i,j,estIdx,predEstIdx,exonIdx,predExonIdx));
                
                   % est is covering full intron
-                  if intronStart >= predExonStart && intronStop <= predExonStop
-                     fprintf(fh,sprintf(' %s is completely overlapping from %d %d\n',currentESTName,intronStart,intronStop))
+                  if intronStart > predExonStart && intronStop < predExonStop
+                     fprintf(fh,sprintf(' %s is completely overlapping from %d %d\n',currentESTName,intronStart,intronStop));
                   % est is nested inside intron
-                  if intronStart < predExonStart && intronStop > predExonStop
-                     fprintf(fh,sprintf(' %s is completely inside intron from %d %d\n',currentESTName,predExonStart,predExonStop))
+                  elseif intronStart < predExonStart && intronStop > predExonStop
+                     fprintf(fh,sprintf(' %s is completely inside intron from %d %d\n',currentESTName,predExonStart,predExonStop));
                   % predicted exon extends across the intron start (upstream overlap)
-                  elseif intronStart >= predExonStart && predExonStop >= intronStart && intronStop >= predExonStop
-                     fprintf(fh,sprintf('%s is upstream overlapping from %d %d\n',currentESTName,intronStart,predExonStop))
+                  elseif intronStart > predExonStart && predExonStop > intronStart && intronStop > predExonStop
+                     fprintf(fh,sprintf('%s is upstream overlapping from %d %d\n',currentESTName,intronStart,predExonStop));
                   % predicted exon extends across the intron stop (downstream overlap)
-                  elseif intronStart <= predExonStart && predExonStart <= intronStop && intronStop <= predExonStop
-                     fprintf(fh,sprintf('%s is downstream overlapping from %d %d\n',currentESTName,predExonStart,intronStop))
-                     fprintf('%s %d %d',fh)
+                  elseif intronStart < predExonStart && predExonStart < intronStop && intronStop < predExonStop
+                     fprintf(fh,sprintf('%s is downstream overlapping from %d %d\n',currentESTName,predExonStart,intronStop));
                   else
                      d=1;
                   end
diff --git a/tools/calculateSizes b/tools/calculateSizes
new file mode 100644 (file)
index 0000000..d13139d
--- /dev/null
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+"""Prefix each line of a text file with a size computed from its last fields.
+
+Usage: calculateSizes FILE
+
+The last two space-separated fields of every line are parsed as integers;
+the line is echoed to stdout as "<last - second-to-last> <line>".
+"""
+
+import sys
+
+with open(sys.argv[1]) as infile:
+   for line in infile:
+      line = line.strip()
+      # Blank lines carry no coordinates; int() would raise on them.
+      if not line:
+         continue
+      slist = line.split(' ')
+      size = int(slist[-1]) - int(slist[-2])
+      # One parenthesised argument: valid under both Python 2 and Python 3.
+      print('%d %s' % (size, line))
diff --git a/tools/extractESTs b/tools/extractESTs
new file mode 100755 (executable)
index 0000000..f59f6d1
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/bash
+#
+# Usage: extractESTs FILE
+#
+# Deduplicates FILE into FILE.uniq, prefixes every line with its size via
+# calculateSizes, and writes the lines whose size is at least $threshold,
+# largest first, to FILE.uniq.over.$threshold.
+
+threshold=100
+
+sort -u "$1" > "$1.uniq"
+
+# Compare the leading size column numerically so the cut-off really follows
+# $threshold. (In a basic-regex grep an unescaped leading '$' and '{3,3}'
+# are literal characters, so such a pattern never selects a size.)
+python calculateSizes "$1.uniq" | sort -rn | awk -v t="$threshold" '$1 >= t' > "$1.uniq.over.${threshold}"