+ added some more scripts from the dataset processing
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 30 May 2008 08:08:20 +0000 (08:08 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 30 May 2008 08:08:20 +0000 (08:08 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9295 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/dataset_scripts/countNotOnChr1-5_pos.py [new file with mode: 0644]
tools/dataset_scripts/dataStatistic.py [new file with mode: 0644]
tools/dataset_scripts/mappingInfo.sh [new file with mode: 0755]

diff --git a/tools/dataset_scripts/countNotOnChr1-5_pos.py b/tools/dataset_scripts/countNotOnChr1-5_pos.py
new file mode 100644 (file)
index 0000000..cd65731
--- /dev/null
@@ -0,0 +1,61 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*- 
+
+import sys
+import qparser
+
+def check(filename):
+   #all_filtered_reads = '/fml/ag-raetsch/share/projects/qpalma/solexa/new_run/allReads.full'
+   #qparser.parse_reads(all_filtered_reads)
+
+   map = {}
+   
+   for line in open(filename):
+
+      sl = line.split()
+      id    = int(sl[0])
+      chromo= int(sl[1])
+      strand = sl[3]
+      if strand == 'D':
+         strand = '+'
+      if strand == 'P':
+         strand = '-'
+
+      if map.has_key(id):
+         current_strand,current_chromo = map[id]
+         if current_strand == '+' and current_chromo in range(1,6):
+            continue
+         else:
+            map[id] = (strand,chromo)
+      else:
+         map[id] = (strand,chromo)
+
+   total_ctr = 0
+   spliced_ctr = 0
+   correct_ctr = 0
+   correct_spliced  = 0
+   correct_unspliced = 0
+
+   for id,elem in map.iteritems():
+      current_strand,current_chromo = elem
+      if id < 1000000300000:
+         spliced_ctr += 1 
+      #print elem
+      if current_strand == '+' and current_chromo in range(1,6):
+         correct_ctr += 1
+         if id < 1000000300000:
+            correct_spliced += 1
+         else:
+            correct_unspliced += 1
+      total_ctr += 1
+
+   print 'total ctr %d' % total_ctr 
+   print 'spliced_ctr %d ' % spliced_ctr
+
+   print 'total correct %d' % correct_ctr 
+   print 'spliced %d ' % correct_spliced
+   print 'unspliced %d ' % correct_unspliced
+
+
+if __name__ == '__main__':
+   check(sys.argv[1])
diff --git a/tools/dataset_scripts/dataStatistic.py b/tools/dataset_scripts/dataStatistic.py
new file mode 100644 (file)
index 0000000..9a24597
--- /dev/null
@@ -0,0 +1,112 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*- 
+
+import sys
+
+#
+#
+#
+
+class Data:
+   pass
+
+
+
+
+class DataStatistic:
+
+   def __init__(self,reads_fn,map_fn,map2_fn,heuristic_fn,heuristic_un_fn):
+      self.reads_fn = reads_fn
+      self.map_fn = map_fn
+      self.map2_fn = map2_fn
+      self.heuristic_fn = heuristic_fn
+      self.heuristic_un_fn = heuristic_un_fn
+
+      #self.all_read_ids_set  = set([])
+      #self.spliced_ids_set   = set([])
+      #self.unspliced_ids_set = set([])
+      #self.map_ids_set       = set([])
+      #self.map_2nd_ids_set   = set([])
+
+
+   def calcStat(self,filename,check_strand):
+      spliced_set    = set([])
+      unspliced_set  = set([])
+      all_set        = set([])
+
+      for line in open(filename):
+         sl = line.strip().split()
+         id = int(sl[0])
+         chromo = int(sl[1])
+         strand = sl[3]
+
+         if check_strand and ((not chromo in range(1,6)) or strand != 'D'):
+            continue
+
+         if id < 1000000300000:
+            spliced_set.add(id)
+         else:
+            unspliced_set.add(id)
+
+         all_set.add(id)
+
+      return spliced_set,unspliced_set,all_set
+
+
+   def calculate(self):
+      all_spliced_set,all_unspliced_set,all_set = self.calcStat(self.reads_fn,False)
+      res = (len(all_spliced_set),len(all_unspliced_set),len(all_set),'total reads')
+      self.printResult(res)
+
+      check_switch = True
+      map_spliced_set,map_unspliced_set,map_set =\
+      self.calcStat(self.map_fn,check_switch)
+      res = (len(map_spliced_set),len(map_unspliced_set),len(map_set),'map reads')
+      self.printResult(res)
+
+      map2_spliced_set,map2_unspliced_set,map2_set =\
+      self.calcStat(self.map2_fn,check_switch)
+      res = (len(map2_spliced_set),len(map2_unspliced_set),len(map2_set),'map2 reads')
+      self.printResult(res)
+
+      heuristic_spliced_set,heuristic_unspliced_set,heuristic_set =\
+      self.calcStat(self.heuristic_fn,check_switch)
+      res = (len(heuristic_spliced_set),len(heuristic_unspliced_set),len(heuristic_set),'heuristic reads')
+      self.printResult(res)
+
+      heuristic_un_spliced_set,heuristic_un_unspliced_set,heuristic_un_set =\
+      self.calcStat(self.heuristic_un_fn,check_switch)
+      res = (len(heuristic_un_spliced_set),len(heuristic_un_unspliced_set),len(heuristic_un_set),'heuristic_un reads')
+      self.printResult(res)
+
+      all_found_set = map_set | map2_set
+      print 'found %d reads' % len(all_found_set)
+
+      not_found_set = all_set - all_found_set
+      print 'could not find %d reads' % len(not_found_set)
+
+      all_found_spliced_set = map_spliced_set | map2_spliced_set
+      print 'found %d spliced reads' % len(all_found_spliced_set)
+
+      not_found_spliced_set = all_spliced_set - all_found_spliced_set
+      print 'could not find %d spliced reads' % len(not_found_spliced_set)
+
+
+      all_found_unspliced_set = map_unspliced_set | map2_unspliced_set
+      not_found_unspliced_set = all_unspliced_set - all_found_unspliced_set 
+      print 'could not find %d unspliced reads' % len(not_found_unspliced_set)
+      for i in range(10):
+         print not_found_unspliced_set.pop()
+
+
+   def printResult(self,res):
+      mes = '%s:\t%d\n' % (res[3],res[2])
+      mes += 'spliced reads:\t%d (%f)\n' % (res[0],(1.0*res[0])/res[2])
+      mes += 'unspliced reads:\t%d (%f)\n' % (res[1],(1.0*res[1])/res[2])
+      print mes
+
+
+if __name__ == '__main__':
+
+   ds = DataStatistic(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5])
+   ds.calculate()
diff --git a/tools/dataset_scripts/mappingInfo.sh b/tools/dataset_scripts/mappingInfo.sh
new file mode 100755 (executable)
index 0000000..f8b34bc
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+dir=$1
+
+num=`cut -f1 $dir/main/reads_0.fl | sort -u | wc -l`
+echo "Started 1st vmatch run with $num reads."
+
+num=`cut -f1 $dir/main/map.vm | sort -u | wc -l`
+echo "Found $num reads in 1st vmatch run."
+
+num=`cut -f1 $dir/spliced/reads_0.fl | sort -u | wc -l`
+echo "Started 2nd vmatch run with $num reads."
+
+num=`cut -f1 $dir/spliced/map.vm | sort -u | wc -l`
+echo "Found $num reads in 2nd vmatch run"