Adding some utility code I've found helpful when working with the Tribble index code

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4108 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-08-25 15:38:47 +00:00
parent 4eff69d95e
commit bc0826801c
7 changed files with 496 additions and 0 deletions

9
ruby/README 100644
View File

@ -0,0 +1,9 @@
This file is a roadmap to the contents of the Ruby directory
Contents:
------------------------------------------------------------------------------------
restartBamboo.rb - a script used to restart Bamboo after system failures
validateIndex.rb - a script for working with Tribble indexes; run 'validateIndex.rb -h' for help
utils/ - any basic utility methods
index/ - the index utility code

View File

@ -0,0 +1,70 @@
require './util/BinaryFileReader'
# Base class for all index types (at least linear and tree). It parses the
# header shared by every index file; subclasses keep reading from @file to load
# their type-specific contents.
class Index
  # Flag value for which the header is followed by a sequence dictionary.
  SEQ_DICT_FLAG = 32768

  attr_reader :type, :headerVersion, :fileName, :fileSize, :ts, :md5, :flags

  # Open +fileName+ and read the header fields in the order they are stored.
  # Prints a message and exits with status 1 when the file does not start with
  # the "TIDX" magic number.
  def initialize(fileName)
    @inputFile = fileName
    @file = BinaryFileReader.new(fileName)
    magic = @file.readBytes(4)
    if magic != "TIDX"
      print "#{@inputFile}: !! Magic number is not what we expected, TIDX, instead we saw #{magic} !!\n"
      exit(1)
    end
    @type = @file.readInt()
    @headerVersion = @file.readInt()
    @fileName = @file.readString()
    @fileSize = @file.readLong()
    @ts = @file.readLong()
    @md5 = @file.readString()
    @flags = @file.readUInt()
    @seqDict = readSeqDictionary() if @flags == SEQ_DICT_FLAG
    # NOTE: despite its name this holds the whole property hash, not a count;
    # the name is kept so header diffs and dumps stay unchanged.
    @propCount = readPropertyDictionary() if @headerVersion >= 3
  end

  # Backward-compatible alias: the reader used to be declared as :t5 although
  # the value has always been stored in @ts.
  def t5
    @ts
  end

  # Check the header fields, printing one line per problem found.
  # Returns true when no problem was found, false otherwise (warnings count as
  # problems too).
  def validate()
    problems = []
    unless (1..2).include?(@type)
      problems << "error: invalid type, we saw #{@type} but expected [1-2]"
    end
    unless (1..3).include?(@headerVersion)
      problems << "error: invalid header version, we saw #{@headerVersion} but expected [1-3]"
    end
    if @fileName == ""
      problems << "warning: on fileName, we saw '#{@fileName}' but expected actual text"
    end
    if @ts.to_s.empty?
      problems << "warning: on TS, we saw '#{@ts}' but expected actual text"
    end
    if @md5 == ""
      problems << "warning: on md5, we saw '#{@md5}' but expected actual text"
    end
    problems.each { |problem| print "#{@inputFile}:\t\t#{problem}\n" }
    problems.empty?
  end

  # Diff two headers: every instance variable of this index (except the file
  # handle and the per-sequence entries) is compared, as a string, with the
  # same variable of +otherIndex+; differences are printed.
  def diffHeader(otherIndex)
    instance_variables.each do |var|
      name = var.to_s
      next if name == "@file" || name == "@sequences"
      puts "Other header doesn't define #{var}" unless otherIndex.instance_variable_defined?(var)
      one = instance_variable_get(var).to_s
      two = otherIndex.instance_variable_get(var).to_s
      puts "type #{var} not equal, #{one} != #{two}" if one != two
    end
  end

  # Read the sequence dictionary, assuming we have one: a count followed by
  # that many (name, size) pairs. Returns the array of names.
  def readSeqDictionary()
    sequences = []
    count = @file.readInt()
    count.times do
      sequences << @file.readString()
      @file.readInt() # drop the sizes for now
    end
    sequences
  end

  # Read the property dictionary: a count followed by that many (key, value)
  # string pairs. Returns them as a hash.
  def readPropertyDictionary()
    properties = {}
    count = @file.readInt()
    count.times do
      key = @file.readString()
      properties[key] = @file.readString()
    end
    properties
  end

  # Close the underlying file reader.
  def close()
    @file.close()
  end
end

View File

@ -0,0 +1,41 @@
# the implementation of the interval index class
$LOAD_PATH << File.dirname(__FILE__)
require "Index.rb"
# Index subclass for interval indexes: after the common header it reads a
# sequence count followed by that many TISeqEntry records.
class IntervalIndex < Index
  # Parse the header through Index, then load every sequence entry from the
  # same open file.
  def initialize(file)
    super(file)
    @nSeq = @file.readInt
    @sequences = []
    1.upto(@nSeq) { @sequences << TISeqEntry.new(@file) }
  end

  # Diff the headers and report a type mismatch, if any.
  # NOTE: the per-sequence entries are not compared, and the method returns
  # false in every case.
  def diff(otherIndex)
    diffHeader(otherIndex)
    sameType = (otherIndex.type == @type)
    print "Indexes are not the same type (#{otherIndex.type} != #{@type})\n" unless sameType
    false
  end
end
# One per-sequence record of an interval index. For every bin it stores four
# values in parallel arrays, in the order they appear in the file.
class TISeqEntry
  # Read the record from +file+, which must respond to readString, readInt and
  # readLong: a contig name, a bin count, then per bin an int, an int, a long
  # and an int.
  def initialize(file)
    @contig = file.readString
    @binCount = file.readInt
    @startPositions = []
    @endPositions = []
    @positions = []
    @sizes = []
    1.upto(@binCount) do
      @startPositions << file.readInt
      @endPositions << file.readInt
      @positions << file.readLong
      @sizes << file.readInt
    end
  end
end

View File

@ -0,0 +1,59 @@
# the linear index class implementation
$LOAD_PATH << File.dirname(__FILE__)
require "Index.rb"
# Index subclass for linear indexes: after the common header it reads a
# sequence count followed by that many LISeqEntry records.
class LinearIndex < Index
  attr_accessor :nSeq, :sequences

  # Parse the header through Index, then load every sequence entry from the
  # same open file.
  def initialize(file)
    super(file)
    @nSeq = @file.readInt()
    @sequences = []
    @nSeq.times { @sequences.push(LISeqEntry.new(@file)) }
  end

  # Diff this index against +otherIndex+.
  #
  # Headers are always diffed. When the types differ a message is printed and
  # false is returned. Otherwise entries are paired by contig name: paired
  # entries are diffed field by field, and every contig of this index that the
  # other index lacks is reported.
  #
  # Returns true when every contig of this index exists in the other one.
  def diff(otherIndex)
    diffHeader(otherIndex)
    if otherIndex.type != @type
      print "Indexes are not the same type (#{otherIndex.type} != #{@type})\n"
      return false
    end

    # Entries are distinct objects without an == of their own, so they have to
    # be matched by contig name rather than with include?/index.
    otherByContig = {}
    otherIndex.sequences.each { |entry| otherByContig[contigOf(entry)] = entry }

    notInOther = []
    @sequences.each do |entry|
      match = otherByContig[contigOf(entry)]
      if match
        entry.diff(match)
      else
        notInOther << entry
      end
    end

    notInOther.each { |entry| puts "Contig #{contigOf(entry)} is not present in the other index" }
    notInOther.empty?
  end

  private

  # Contig name of a sequence entry; read through the instance variable because
  # the entry class exposes no reader for it.
  def contigOf(entry)
    entry.instance_variable_get(:@contig)
  end
end
# One per-sequence record of a linear index.
class LISeqEntry
  # Read the record from +file+, which must respond to readString, readInt and
  # readLong: a contig name, five ints, one long per bin, and a final long.
  def initialize(file)
    @contig = file.readString
    @binWidth = file.readInt
    @binCount = file.readInt
    @longestFeature = file.readInt
    @maxBin = file.readInt
    @totalBin = file.readInt
    @startPositions = []
    1.upto(@binCount) { @startPositions << file.readLong }
    @finalPos = file.readLong
  end

  # Compare every instance variable of this entry, as a string, with the same
  # variable of +otherLISeqEntry+ and print each one that is missing or differs.
  def diff(otherLISeqEntry)
    instance_variables.each do |var|
      name = var.to_s
      next if name == "@file" || name == "@sequences"
      unless otherLISeqEntry.instance_variable_defined?(var)
        puts "Other LISeqEntry doesn't define #{var}"
      end
      mine = instance_variable_get(var).to_s
      theirs = otherLISeqEntry.instance_variable_get(var).to_s
      puts "otherLISeqEntry: type #{var} not equal, #{mine} != #{theirs}" unless mine == theirs
    end
  end
end

View File

@ -0,0 +1,170 @@
# This Ruby script takes two indexes (of the same type) and diffs them. If they are of different types,
# it will stop after the header.
# Print +message+ framed by two banner lines, then exit with status 1.
def exitWithError(message)
  # The banner must be a quoted string: a bare run of '#' starts a comment, so
  # an unquoted banner makes puts print an empty line instead.
  banner = "###########################################"
  puts banner
  puts message
  puts banner
  exit(1)
end
# exactly two index files are required
exitWithError("We take two files as input, try again!") unless ARGV.size == 2

# open the indexes; they are binary files, so open them in binary mode to keep
# the bytes untouched on every platform
index1 = File.new(ARGV[0], "rb")
index2 = File.new(ARGV[1], "rb")
# Read +byteCount+ raw bytes from each file and print whether the two chunks
# are equal, labelling the line with +type+.
def compValues(file1, file2, byteCount, type)
  first = file1.sysread(byteCount)
  second = file2.sysread(byteCount)
  relation = (first != second) ? "!=" : "=="
  print "#{type}, index1 (#{first}) #{relation} index2 (#{second})\n"
end
# Read +byteCount+ bytes from each file, unpack them as a native-endian signed
# integer ("q" when more than 4 bytes are read, "i" otherwise) and print
# whether the unpacked values are equal, labelling the line with +type+.
def compInts(file1, file2, byteCount, type)
  directive = byteCount > 4 ? "q" : "i"
  first = file1.sysread(byteCount).unpack(directive)
  second = file2.sysread(byteCount).unpack(directive)
  relation = (first != second) ? "!=" : "=="
  print "#{type}, index1 (#{first}) #{relation} index2 (#{second})\n"
end
# Read a null-terminated string from +index+ one byte at a time and return it
# without the terminator. sysread raises EOFError if the file ends before a
# terminator is found.
def readString(index)
  buffer = []
  while (ch = index.sysread(1)) != "\0"
    buffer.push(ch)
  end
  # join, not to_s: on Ruby 1.9+ Array#to_s returns the inspect form
  # (e.g. ["a", "b"]) instead of the concatenated characters
  buffer.join
end
# Walk the header of both indexes field by field. The reads below are strictly
# sequential, so their order must match the on-disk layout of the header.
# Both files must start with the 4-byte magic number "TIDX"
exitWithError("Magic number not valid for index 1") if index1.sysread(4) != "TIDX"
exitWithError("Magic number not valid for index 2") if index2.sysread(4) != "TIDX"
# compare the index types (4-byte int)
compInts(index1,index2,4,"types")
# compare the header versions (4-byte int); v1/v2 are kept as one-element
# arrays because later code inspects v1[0] and v2[0]
v1 = index1.sysread(4).unpack("i")
v2 = index2.sysread(4).unpack("i")
if (v1 != v2)
print "version, index1 (#{v1}) != index2 (#{v2})\n"
else
print "version, index1 (#{v1}) == index2 (#{v2})\n"
end
# compare the file names (null-terminated strings)
fl1 = readString(index1)
fl2 = readString(index2)
if (fl1 != fl2)
print "filename, index1 (#{fl1}) != index2 (#{fl2})\n"
else
print "filename, index1 (#{fl1}) == index2 (#{fl2})\n"
end
# compare the file sizes (8-byte int)
compInts(index1,index2,8,"sizes")
# compare the next 8 bytes as raw bytes, labelled "T5"
# NOTE(review): the Index class reads the field at this position as a long
# into @ts, so this is presumably a timestamp -- confirm
compValues(index1,index2,8,"T5")
# compare the MD5 fields (null-terminated strings); per the original note the
# MD5 sums are not written yet, so this is usually just the terminator byte
fl1 = readString(index1)
fl2 = readString(index2)
if (fl1 != fl2)
print "md5, index1 (#{fl1}) != index2 (#{fl2})\n"
else
print "md5, index1 (#{fl1}) == index2 (#{fl2})\n"
end
# compare the flags (4-byte unsigned int); only a difference is reported
index1Flags = (index1.sysread(4)).unpack("L")
index2Flags = (index2.sysread(4)).unpack("L")
if (index1Flags != index2Flags)
print "Flags are different, index1 = #{index1Flags[0]}, index2 = #{index2Flags[0]}\n"
end
# Read a sequence dictionary from +file+: a 4-byte count followed by that many
# (null-terminated name, 4-byte size) pairs. Prints a progress line and the
# count, and returns the array of names; the sizes are skipped.
def readSeqDictionary(file)
  puts "reading seq dict"
  count = file.sysread(4).unpack("i")
  puts count
  sequences = []
  count[0].times do
    # Array has no add method; append with <<
    sequences << readString(file).to_s
    file.sysread(4) # drop the sizes for now
  end
  sequences
end
# Print the sequence dictionary of every index that carries one. The two
# checks are independent: each file has to consume its own dictionary,
# otherwise the reads that follow are misaligned for that file.
puts readSeqDictionary(index1) if index1Flags[0] == 32768
puts readSeqDictionary(index2) if index2Flags[0] == 32768

# Skip the property dictionary (header version 3 and later): a 4-byte count
# followed by that many (key, value) pairs of null-terminated strings.
[[index1, v1], [index2, v2]].each do |indexFile, version|
  next unless version[0] >= 3
  propertyCount = indexFile.sysread(4).unpack("i")[0]
  propertyCount.times do
    readString(indexFile) # key
    readString(indexFile) # value
  end
end
# Read one per-contig entry of a linear index from +i1+ and print its fields:
# the contig name, five 4-byte ints, one 8-byte start position per bin and the
# trailing 8-byte final position. Returns the number of bins.
def readSeqEntry(i1)
  # unpack returns an array; take the single element so the values print the
  # same way on every Ruby version
  readInt = lambda { i1.sysread(4).unpack("i")[0] }
  readLong = lambda { i1.sysread(8).unpack("q")[0] }

  puts "--------------------------------------------"
  puts "Contig --> #{readString(i1).to_s}"
  print "bin width = #{readInt.call}\n"
  binCount = readInt.call
  print "number of bins = #{binCount}\n"
  print "longest feature = #{readInt.call}\n"
  print "max bin size = #{readInt.call}\n"
  print "total bin size = #{readInt.call}\n"

  lastStartPos = -1
  binCount.times do |index|
    startPos = readLong.call
    # every bin is printed; the original ordering check (startPos < lastStartPos) is disabled
    puts "bin at index #{index}, this start = #{startPos}, last start = #{lastStartPos}"
    lastStartPos = startPos
  end

  # The entry ends with one more 8-byte position after the bins (LISeqEntry
  # reads it as @finalPos). It has to be consumed here, otherwise the next
  # entry is read from the wrong offset.
  print "final position = #{readLong.call}\n"
  binCount
end
# Read the 4-byte sequence count of each index, print both counts, then dump
# every sequence entry of the first index followed by every entry of the
# second. Returns the sequence count of the second index.
def compContigLinear(i1, i2)
  firstCount = i1.sysread(4).unpack("L")
  secondCount = i2.sysread(4).unpack("L")
  print "seq count 1 = #{firstCount}, count 2 = #{secondCount}\n"

  puts "\nentries for index 1"
  1.upto(firstCount[0]) { readSeqEntry(i1) }

  puts "\nentries for index 2"
  1.upto(secondCount[0]) { readSeqEntry(i2) }

  secondCount[0]
end
# dump the per-contig entries of both indexes, then release the file handles
compContigLinear(index1, index2)
puts "Done!"
[index1, index2].each { |handle| handle.close }

View File

@ -0,0 +1,44 @@
# A Ruby class for reading binary files; really this just adds some convenience
# methods like readInt(), readLong(), etc. All reads go through sysread, which
# raises EOFError at the end of the file.
class BinaryFileReader
  # Open +fileName+ for reading. Binary mode keeps the bytes untouched on every
  # platform.
  def initialize(fileName)
    @file = File.open(fileName, "rb")
  end

  # read and return an int (4 byte, signed, machine based endian)
  def readInt
    @file.sysread(4).unpack("i")[0]
  end

  # read and return an int (4 byte, unsigned, machine based endian)
  def readUInt
    @file.sysread(4).unpack("L")[0]
  end

  # read and return a long (8 byte, signed, machine based endian)
  def readLong
    @file.sysread(8).unpack("q")[0]
  end

  # read and return a set number of bytes as a string
  def readBytes(count)
    @file.sysread(count).to_s
  end

  # read and return a null terminated string (without the terminator)
  def readString
    buffer = []
    while (ch = @file.sysread(1)) != "\0"
      buffer.push(ch)
    end
    # join, not to_s: on Ruby 1.9+ Array#to_s returns the inspect form
    # (e.g. ["a", "b"]) instead of the concatenated characters
    buffer.join
  end

  # close the file
  def close
    @file.close
  end
end

View File

@ -0,0 +1,103 @@
# This Ruby script checks, diffs, or prints Tribble indexes (linear or interval)
# set the include path to include the current directory
$LOAD_PATH << File.dirname(__FILE__)
# require a couple of files
require "index/Index.rb"
require "index/LinearIndex.rb"
require "index/IntervalIndex.rb"
require "optparse"
require "yaml"
# This hash will hold all of the options parsed from the command line by
# OptionParser.
options = {}
optparse = OptionParser.new do |opts|
  # Set a banner, displayed at the top of the help screen.
  opts.banner = "Usage: ruby validateIndex.rb [options] file1 file2 ..."

  # Define the options, and what they do. Every flag gets an explicit default
  # under the same key its handler writes to.
  options[:index] ||= []
  opts.on('-i', '--index INDEX (REQUIRED)', 'Specify the index. Multiple are allowed') do |file|
    options[:index].push(file)
  end

  options[:verbose] = false
  opts.on('-v', '--verbose', 'Output more information') do
    options[:verbose] = true
  end

  # The default used to be stored under :validate, a key no option ever sets.
  options[:check] = false
  opts.on('-c', '--check', 'Check (Validate) the index(es) passed in as parameters') do
    options[:check] = true
  end

  options[:diff] = false
  opts.on('-d', '--diff', 'Diff two indexes') do
    options[:diff] = true
  end

  options[:print] = false
  opts.on('-p', '--print', 'Print all of the information about the file') do
    options[:print] = true
  end

  # This displays the help screen, all programs are assumed to have this option.
  opts.on_tail('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end
# parse the command line
optparse.parse!
# at least one index file is required: print the usage and stop instead of
# carrying on with nothing to do
if options[:index].empty?
  puts "you must at least specify an index file!"
  puts optparse
  exit(1)
end
# Load the index stored in +file+. The header is read once through a throwaway
# Index to find the type; type 1 is loaded as a LinearIndex and any other type
# as an IntervalIndex. Returns the loaded index.
def loadIndex(file)
  probe = Index.new(file)
  probe.close
  if probe.type == 1
    puts "Linear index..."
    LinearIndex.new(file)
  else
    puts "Interval index..."
    IntervalIndex.new(file)
  end
end
#################### Control Block ####################
# load all of the indexes
indexes = options[:index].map { |indexFile| loadIndex(indexFile) }

# switch on the flags supplied
if options[:diff]
  if indexes.size != 2
    print "Unable to diff indexes if you don't supply two and only two indexes\n"
    exit(1)
  end
  indexes[0].diff(indexes[1])
elsif options[:print]
  indexes.each { |index| puts YAML::dump(index) }
end

# -c/--check: validate the indexes that are already loaded, rather than opening
# every file a second time and leaving it open. :validate is honoured as well,
# since older code stored the flag's default under that key.
if options[:check] || options[:validate]
  indexes.each { |index| index.validate }
end

# release the file handles held by the loaded indexes
indexes.each { |index| index.close }
exit