--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
class Gene:
    """Container for one annotated gene.

    Holds the chromosome name, strand, start/stop coordinates and lists of
    exon and UTR intervals. Exons are stored as ``(start, stop)`` tuples in
    the order they were added.
    """

    # Class-level fallbacks, kept so that ``Gene.<attr>`` still resolves.
    # The three lists below are shared objects; instances built through
    # __init__ never use them because __init__ creates their own lists.
    chromosome = ''
    strand = ''
    start = -1
    stop = -1
    exons = []
    fiveUTR = []
    threeUTR = []

    def __init__(self, chr, begin, end, strand):
        """Create a gene on chromosome ``chr`` spanning ``begin``..``end``.

        Raises AssertionError if ``chr`` is empty, if the coordinates do not
        satisfy ``0 <= begin <= end``, or if ``strand`` is not '+' or '-'.
        """
        assert chr != ''
        assert begin >= 0 and begin <= end and end >= 0
        assert strand in ['+', '-']

        self.chromosome = chr
        self.start = begin
        self.stop = end
        self.strand = strand

        # Fresh lists per instance. Without these, addExon() would append to
        # the class-level list, mixing the exons of all genes together and
        # leaving nothing in the instance __dict__ for pickle to save.
        self.exons = []
        self.fiveUTR = []
        self.threeUTR = []

    def addExon(self, start, stop):
        """Append the interval ``(start, stop)`` to this gene's exon list."""
        self.exons.append((start, stop))

    def load(self, filename):
        """Placeholder: not implemented, currently does nothing."""
        pass

    def save(self, filename):
        """Placeholder: not implemented, currently does nothing."""
        pass
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
class Read:
    """Value holder for one read: location, sequence, strand and three
    quality-related fields. All constructor arguments are stored unchanged.
    """

    # Class-level defaults; every one of them is overwritten in __init__.
    chromosome, sequence, strand = '', '', ''
    position = 0
    prob = calibratedProb = chastity = None

    # Not stored on the read (yet): mismatch, repeats, length, deletion.

    def __init__(self, chr, pos, seq, strand, prob, calibrated, chastity):
        """Store the given values on the instance without any conversion."""
        # Where the read maps and what it looks like.
        self.chromosome, self.position = chr, pos
        self.sequence, self.strand = seq, strand

        # Quality-related values, kept exactly as supplied by the caller.
        self.prob, self.calibratedProb = prob, calibrated
        self.chastity = chastity
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import os.path
+import csv
+from PyGff import *
+import cPickle
+
# Usage text; the entry point shows it when fewer than two file arguments
# are given on the command line.
info = """
You have to supply two files. One containing the gff information and the other
one containing the information of the Solexa(R) reads.

Usage: ./createTestSet.py gff.pickle reads.pickle
"""

# Reminder that the entry point prints before loading the two pickles.
doc = """
Make sure that you split the data into chromosome files beforehand because
this method does not check the chromosome info
"""
+
def check(annot, reads):
    """Print the number of annotation entries, then the number of reads.

    Both arguments only need to support ``len()``. Nothing is returned.
    """
    # Parenthesised single-argument form: prints the same text under the
    # Python 2 print statement and is also valid syntax on Python 3.
    print(len(annot))
    print(len(reads))
+
+
+
+
if __name__ == '__main__':
    # Explicit checks instead of assert: assert statements are removed when
    # Python runs with -O, which would skip all argument validation.
    if len(sys.argv) < 3:
        sys.exit(info)

    annotFile = sys.argv[1]
    readsFile = sys.argv[2]
    for path in (annotFile, readsFile):
        if not os.path.exists(path):
            sys.exit('File %s does not exist!' % path)

    print(doc)

    # NOTE(review): unpickling executes whatever the pickle describes; only
    # feed this script pickle files produced by the project's own parsers.
    # The open mode is left at the default to match how the files were read
    # before; `with` makes sure both handles are closed again.
    with open(annotFile) as annot_fid:
        annotation = cPickle.load(annot_fid)
    with open(readsFile) as reads_fid:
        reads = cPickle.load(reads_fid)

    check(annotation, reads)
+
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import os.path
+import csv
+from PyGff import *
+import cPickle
+
def parse(gff_fid, pickle_fid):
    """Read a tab-separated GFF annotation and pickle the genes found in it.

    ``gff_fid`` is an open, readable file object (or any iterable of lines);
    ``pickle_fid`` is an open, writable file object. A single list of Gene
    objects is written to ``pickle_fid`` with cPickle.dump.

    Every 'gene' row opens a new gene and each following 'exon' row is added
    to it. A gene is closed by the next 'gene' row, by a 'pseudogene' row
    (pseudogenes themselves are not stored) or by the end of the input.

    Raises AssertionError for a data row that does not have 9 columns or that
    has an unknown feature type, and ValueError if the start/stop columns are
    not integers.
    """
    # Feature types that are recognised but contribute nothing at the moment.
    ignored_types = frozenset([
        'five_prime_UTR', 'three_prime_UTR', 'mRNA', 'CDS', 'ncRNA',
        'pseudogenic_exon', 'pseudogenic_transcript',
        'snoRNA', 'snRNA', 'miRNA', 'tRNA',
    ])

    reader = csv.reader(gff_fid, delimiter='\t', quoting=csv.QUOTE_NONE)

    allGenes = []
    currentGene = None

    for row in reader:
        # Blank lines come back as an empty row; '#' lines are GFF comments
        # or directives. Neither is data, so skip them instead of failing
        # the column-count check below.
        if not row or row[0].startswith('#'):
            continue

        # Raised explicitly so the check also runs under `python -O`.
        if len(row) != 9:
            raise AssertionError(
                'Error: expected 9 tab-separated columns, got %d' % len(row))

        seqname = row[0]
        feature = row[2]
        start = int(row[3])
        stop = int(row[4])
        strand = row[6]

        if feature == 'chromosome':
            continue

        if feature == 'gene':
            # A new gene starts: store the one collected so far.
            if currentGene is not None:
                allGenes.append(currentGene)
            currentGene = Gene(seqname, start, stop, strand)

        elif feature == 'exon':
            # NOTE: an exon row with no open gene (before the first gene or
            # right after a pseudogene) fails here with AttributeError.
            currentGene.addExon(start, stop)

        elif feature == 'pseudogene':
            # Closes the open gene; the pseudogene itself is not recorded.
            if currentGene is not None:
                allGenes.append(currentGene)
                currentGene = None

        elif feature not in ignored_types:
            raise AssertionError("Error: Unknown identifier '%s'" % feature)

    # The final gene has no following 'gene'/'pseudogene' row to flush it,
    # so store it here; otherwise it would be missing from the output.
    if currentGene is not None:
        allGenes.append(currentGene)

    cPickle.dump(allGenes, pickle_fid)
+
+
if __name__ == '__main__':
    # Explicit checks instead of assert: under `python -O` asserts are
    # removed, and the overwrite guard below would silently disappear.
    if len(sys.argv) < 3:
        sys.exit('Usage: %s annotation.gff output.pickle' % sys.argv[0])

    annotFile = sys.argv[1]
    pickleFile = sys.argv[2]
    if not os.path.exists(annotFile):
        sys.exit('File %s does not exist!' % annotFile)
    if os.path.exists(pickleFile):
        sys.exit('File %s already exists, refusing to overwrite it!' % pickleFile)

    # `with` closes both files even if parsing fails, so the pickle is
    # flushed to disk deterministically. The 'w+' mode is left unchanged.
    with open(annotFile) as gff_fid:
        with open(pickleFile, 'w+') as pickle_fid:
            parse(gff_fid, pickle_fid)
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import os.path
+import cPickle
+import csv
+from Solexa import Read
+
def parse(solexa_fid, pickle_fid):
    """Read a tab-separated reads file and pickle the reads as one list.

    ``solexa_fid`` is an open, readable file object (or any iterable of
    lines) with 12 columns per data row; ``pickle_fid`` is an open, writable
    file object. One list of Read objects is written with cPickle.dump.

    Columns used: 0 chromosome, 1 position, 2 sequence, 4 strand code
    ('P' or 'D'), 9-11 three character-encoded quality strings. Column 3
    must be an integer but is not stored; columns 5-8 are ignored.

    Raises AssertionError for a data row without 12 columns, KeyError for an
    unknown strand code and ValueError if column 3 is not an integer.
    """
    reader = csv.reader(solexa_fid, delimiter='\t', quoting=csv.QUOTE_NONE)

    # Strand codes used in the input mapped to the '+'/'-' notation.
    stmap = {'P': '-', 'D': '+'}

    allReads = []

    for row in reader:
        # A blank line (e.g. a trailing newline at the end of the file) is
        # returned as an empty row; skip it instead of failing below.
        if not row:
            continue

        # Raised explicitly so the check also runs under `python -O`.
        if len(row) != 12:
            raise AssertionError(
                'Error: expected 12 tab-separated columns, got %d' % len(row))

        seqname = row[0]
        # NOTE(review): the position is stored as the raw string from the
        # file; an int may have been intended - confirm with the consumers.
        pos = row[1]
        seq = row[2]
        # The read id is not stored, but a non-integer id still aborts
        # parsing exactly as before.
        int(row[3])
        strand = stmap[row[4]]

        # Decode the three quality strings character by character. The
        # offsets (-50, -64, +10) are taken over unchanged.
        # NOTE(review): the '+10' for chastity looks unusual next to the two
        # subtractions - confirm it is intended.
        prob = [ord(elem) - 50 for elem in row[9]]
        calibratedProb = [ord(elem) - 64 for elem in row[10]]
        chastity = [ord(elem) + 10 for elem in row[11]]

        allReads.append(
            Read(seqname, pos, seq, strand, prob, calibratedProb, chastity))

    cPickle.dump(allReads, pickle_fid)
+
if __name__ == '__main__':
    # Explicit checks instead of assert: under `python -O` asserts are
    # removed, and the overwrite guard below would silently disappear.
    if len(sys.argv) < 3:
        sys.exit('Usage: %s reads.txt output.pickle' % sys.argv[0])

    solexaFile = sys.argv[1]
    pickleFile = sys.argv[2]
    if not os.path.exists(solexaFile):
        sys.exit('File %s does not exist!' % solexaFile)
    if os.path.exists(pickleFile):
        sys.exit('File %s already exists, refusing to overwrite it!' % pickleFile)

    # `with` closes both files even if parsing fails, so the pickle is
    # flushed to disk deterministically. The 'w+' mode is left unchanged.
    with open(solexaFile) as solexa_fid:
        with open(pickleFile, 'w+') as pickle_fid:
            parse(solexa_fid, pickle_fid)
+