+ added python parsers for gff and solexa / read files
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 17 Dec 2007 15:11:15 +0000 (15:11 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 17 Dec 2007 15:11:15 +0000 (15:11 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7156 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/data_tools/PyGff.py [new file with mode: 0644]
tools/data_tools/Solexa.py [new file with mode: 0644]
tools/data_tools/createTestSet.py [new file with mode: 0644]
tools/data_tools/parseGff.py [new file with mode: 0644]
tools/data_tools/parseSolexa.py [new file with mode: 0644]

diff --git a/tools/data_tools/PyGff.py b/tools/data_tools/PyGff.py
new file mode 100644 (file)
index 0000000..618c2aa
--- /dev/null
@@ -0,0 +1,33 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*- 
+
+class Gene:
+   """
+   Annotation of a single gene: its location plus the exons collected for it.
+
+   Attributes:
+      chromosome -- chromosome name as passed to the constructor
+      strand     -- '+' or '-'
+      start      -- first position of the gene (0 <= start <= stop)
+      stop       -- last position of the gene
+      exons      -- list of (start,stop) tuples added through addExon
+      fiveUTR    -- list, created empty (nothing in this class fills it yet)
+      threeUTR   -- list, created empty (nothing in this class fills it yet)
+   """
+
+   # Class-level defaults. They are kept so that existing code reading them
+   # from the class keeps working; every new instance shadows all of them in
+   # __init__.
+   chromosome = ''
+   strand = ''
+   start = -1
+   stop = -1
+   exons = []
+   fiveUTR = []
+   threeUTR = []
+
+
+   def __init__(self,chr,begin,end,strand):
+      """
+      Create a gene on chromosome chr that spans begin..end on the given strand.
+
+      Raises AssertionError if chr is empty, if the coordinates are negative
+      or out of order, or if strand is neither '+' nor '-'.
+      """
+      assert chr != ''
+      assert begin >= 0 and begin <= end and end >= 0
+      assert strand in ['+','-']
+
+      self.chromosome = chr
+      self.start = begin
+      self.stop = end
+      self.strand = strand
+
+      # Each gene needs its own lists. Appending to the class-level lists
+      # would mix the exons of all genes in one shared list, and because that
+      # list is not part of the instance state it would not be pickled.
+      self.exons = []
+      self.fiveUTR = []
+      self.threeUTR = []
+
+
+   def addExon(self,start,stop):
+      """Append the exon (start,stop) to this gene's exon list."""
+      self.exons.append((start,stop))
+
+   def load(self,filename):
+      """Placeholder, does nothing yet."""
+      pass
+
+   def save(self,filename):
+      """Placeholder, does nothing yet."""
+      pass
diff --git a/tools/data_tools/Solexa.py b/tools/data_tools/Solexa.py
new file mode 100644 (file)
index 0000000..ad48439
--- /dev/null
@@ -0,0 +1,27 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*-
+
+class Read:
+   """
+   Plain record for one Solexa read: where it maps, its sequence and the
+   quality information that accompanies it.
+   """
+
+   # Class-level defaults; __init__ overwrites every one of them per instance.
+   chromosome = ''
+   position = 0
+   sequence = ''
+   strand = ''
+
+   prob = None
+   calibratedProb = None
+   chastity = None
+
+   # Not stored so far: mismatch, repeats, length, deletion.
+
+   def __init__(self,chr,pos,seq,strand,prob,calibrated,chastity):
+      """
+      Store all arguments on the instance exactly as given; nothing is
+      validated or converted here.
+      """
+      # where the read maps
+      self.chromosome, self.position, self.strand = chr, pos, strand
+      # the read itself and its quality information
+      self.sequence = seq
+      self.prob, self.calibratedProb, self.chastity = prob, calibrated, chastity
diff --git a/tools/data_tools/createTestSet.py b/tools/data_tools/createTestSet.py
new file mode 100644 (file)
index 0000000..03c9d06
--- /dev/null
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import os.path
+import csv
+from PyGff import *
+import cPickle
+
+# Usage text; it is the message of the command line argument-count check.
+info = """
+You have to supply two files. One containing the gff information and the other
+one containing the information of the Solexa(R) reads.
+
+Usage: ./createTestSet.py gff.pickle reads.pickle
+"""
+
+# Notice that the script prints before it loads its input files.
+doc = """
+Make sure that you split the data into chromosome files beforehand because
+this method does not check the chromosome info
+"""
+
+def check(annot,reads):
+   """
+   Print the number of entries in annot and then in reads, one per line.
+   """
+   for collection in (annot,reads):
+      print(len(collection))
+
+   
+
+      
+if __name__ == '__main__':
+   # Validate the command line explicitly: assert statements are removed
+   # when Python runs with -O, which would silently skip these checks.
+   if len(sys.argv) < 3:
+      sys.exit(info)
+   annotFile = sys.argv[1]
+   readsFile = sys.argv[2]
+   for path in (annotFile,readsFile):
+      if not os.path.exists(path):
+         sys.exit('File %s does not exist!' % path)
+   print(doc)
+
+   # Load both pickles and close each file as soon as it has been read.
+   # NOTE: unpickling can execute arbitrary code, so only pass files that
+   # were produced by the parser scripts of this project.
+   annot_fid = open(annotFile)
+   try:
+      annotation = cPickle.load(annot_fid)
+   finally:
+      annot_fid.close()
+
+   reads_fid = open(readsFile)
+   try:
+      reads = cPickle.load(reads_fid)
+   finally:
+      reads_fid.close()
+
+   check(annotation,reads)
+
diff --git a/tools/data_tools/parseGff.py b/tools/data_tools/parseGff.py
new file mode 100644 (file)
index 0000000..5329b8e
--- /dev/null
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import os.path
+import csv
+from PyGff import *
+import cPickle
+
+def parse(gff_fid,pickle_fid):
+   """
+   Read a tab-separated GFF annotation from gff_fid, build one Gene for every
+   'gene' row together with the 'exon' rows that follow it, and pickle the
+   list of all genes to pickle_fid.
+
+   Arguments:
+      gff_fid    -- iterable over the lines of the GFF file (e.g. an open file)
+      pickle_fid -- writable file object that receives the pickled gene list
+
+   Blank lines and lines starting with '#' are ignored. 'chromosome' rows and
+   the other known feature types listed below are skipped. A 'pseudogene'
+   row closes the gene that is currently being collected.
+
+   Raises AssertionError for a row that does not have 9 columns, for an
+   unknown feature type and for an 'exon' row without a preceding 'gene' row.
+   Raises ValueError if the start or stop column is not an integer.
+   """
+   # Feature types that are recognised but contribute nothing to a Gene.
+   skippedTypes = ('chromosome', 'five_prime_UTR', 'three_prime_UTR', 'mRNA',
+      'CDS', 'ncRNA', 'pseudogenic_exon', 'pseudogenic_transcript', 'snoRNA',
+      'snRNA', 'miRNA', 'tRNA')
+
+   reader = csv.reader(gff_fid, delimiter='\t', quoting=csv.QUOTE_NONE)
+
+   allGenes = []
+   currentGene = None
+
+   for row in reader:
+      # Blank lines and '#' comment lines carry no feature.
+      if not row or row[0].startswith('#'):
+         continue
+
+      # Raised explicitly so that the check survives python -O.
+      if len(row) != 9:
+         raise AssertionError('Error: Expected 9 columns but found %d' % len(row))
+
+      chromosome = row[0]
+      featureType = row[2]
+      start = int(row[3])
+      stop = int(row[4])
+      strand = row[6]
+
+      if featureType in skippedTypes:
+         continue
+
+      if featureType == 'gene':
+         # A new gene starts: store the one collected so far.
+         if currentGene is not None:
+            allGenes.append(currentGene)
+
+         currentGene = Gene(chromosome,start,stop,strand)
+
+      elif featureType == 'exon':
+         if currentGene is None:
+            raise AssertionError('Error: exon row without a preceding gene row')
+         currentGene.addExon(start,stop)
+
+      elif featureType == 'pseudogene':
+         # A pseudogene ends the current gene and does not start a new one.
+         if currentGene is not None:
+            allGenes.append(currentGene)
+            currentGene = None
+
+      else:
+         raise AssertionError('Error: Unknown identifier \'%s\'' % featureType)
+
+   # The gene that is still open at the end of the input has no following
+   # 'gene' or 'pseudogene' row to trigger its storage, so store it here.
+   if currentGene is not None:
+      allGenes.append(currentGene)
+
+   cPickle.dump(allGenes,pickle_fid)
+
+      
+if __name__ == '__main__':
+   # Usage: parseGff.py <gff annotation file> <pickle file to create>
+   # The checks are explicit because assert statements are removed when
+   # Python runs with -O.
+   if len(sys.argv) < 3:
+      sys.exit('Usage: %s annotation.gff genes.pickle' % sys.argv[0])
+   annotFile = sys.argv[1]
+   pickleFile = sys.argv[2]
+   if not os.path.exists(annotFile):
+      sys.exit('File %s does not exist!' % annotFile)
+   # Never overwrite an existing output file.
+   if os.path.exists(pickleFile):
+      sys.exit('File %s already exists!' % pickleFile)
+
+   gff_fid = open(annotFile)
+   try:
+      pickle_fid = open(pickleFile,'w+')
+      try:
+         parse(gff_fid,pickle_fid)
+      finally:
+         # Closing makes sure the pickle is flushed to disk completely.
+         pickle_fid.close()
+   finally:
+      gff_fid.close()
diff --git a/tools/data_tools/parseSolexa.py b/tools/data_tools/parseSolexa.py
new file mode 100644 (file)
index 0000000..f562cbd
--- /dev/null
@@ -0,0 +1,54 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*-
+
+import sys
+import os.path
+import cPickle
+import csv
+from Solexa import Read
+
+def parse(solexa_fid,pickle_fid):
+   """
+   Turn the tab-separated Solexa read lines from solexa_fid into Read objects
+   and pickle the list of all reads to pickle_fid.
+
+   Every row must have 12 columns (checked with assert). Columns used:
+      0  chromosome
+      1  position (stored as the string found in the file)
+      2  sequence
+      4  strand code, 'D' is stored as '+' and 'P' as '-'
+      9  prob, 10 calibrated prob, 11 chastity
+   Column 3 has to be an integer but is not stored; columns 5 to 8 are not
+   used.
+
+   Each of the three quality strings becomes a list with one integer per
+   character: ord(c)-50 for prob, ord(c)-64 for the calibrated prob and
+   ord(c)+10 for chastity.
+   """
+   strandOf = {'P':'-','D':'+'}
+   records = csv.reader(solexa_fid, delimiter='\t', quoting=csv.QUOTE_NONE)
+
+   collected = []
+
+   for fields in records:
+      assert len(fields) == 12
+
+      # Column 3 must be an integer (ValueError otherwise); it is not kept.
+      int(fields[3])
+      orientation = strandOf[fields[4]]
+
+      rawProb, rawCalibrated, rawChastity = fields[9:12]
+      probValues = [ord(c)-50 for c in rawProb]
+      calibratedValues = [ord(c)-64 for c in rawCalibrated]
+      chastityValues = [ord(c)+10 for c in rawChastity]
+
+      # NOTE(review): the position is passed on as a string, whereas the GFF
+      # parser converts its coordinates to int -- confirm this is intended.
+      collected.append(Read(fields[0],fields[1],fields[2],orientation,
+         probValues,calibratedValues,chastityValues))
+
+   cPickle.dump(collected,pickle_fid)
+
+if __name__ == '__main__':
+   # Usage: parseSolexa.py <solexa read file> <pickle file to create>
+   # The checks are explicit because assert statements are removed when
+   # Python runs with -O.
+   if len(sys.argv) < 3:
+      sys.exit('Usage: %s reads.txt reads.pickle' % sys.argv[0])
+   solexaFile = sys.argv[1]
+   pickleFile = sys.argv[2]
+   if not os.path.exists(solexaFile):
+      sys.exit('File %s does not exist!' % solexaFile)
+   # Never overwrite an existing output file.
+   if os.path.exists(pickleFile):
+      sys.exit('File %s already exists!' % pickleFile)
+
+   solexa_fid = open(solexaFile)
+   try:
+      pickle_fid = open(pickleFile,'w+')
+      try:
+         parse(solexa_fid,pickle_fid)
+      finally:
+         # Closing makes sure the pickle is flushed to disk completely.
+         pickle_fid.close()
+   finally:
+      solexa_fid.close()
+