adding some utility code I've found helpful when working with the Tribble index code
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4108 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4eff69d95e
commit
bc0826801c
|
|
@ -0,0 +1,9 @@
|
||||||
|
This file is a roadmap to the contents of the Ruby directory
|
||||||
|
|
||||||
|
Contents:
|
||||||
|
------------------------------------------------------------------------------------
|
||||||
|
restartBamboo.rb - a script used to restart Bamboo after system failures
|
||||||
|
validateIndex.rb - a script for working with Tribble indexes, type 'validateIndex.rb' -h for help
|
||||||
|
|
||||||
|
utils/ - any basic utility methods
|
||||||
|
index/ - the index utility code
|
||||||
|
|
@ -0,0 +1,70 @@
|
||||||
|
require './util/BinaryFileReader'
|
||||||
|
# This is the base class for all index types (at least linear and tree).
# The constructor parses the header fields shared by every index and leaves
# @file positioned just past them, so subclasses can keep reading from there.
class Index
  attr_reader :type, :headerVersion, :fileName, :fileSize, :ts, :md5, :flags

  # The reader was originally declared as :t5 although the value lives in @ts
  # (so it always returned nil); keep the old name working as an alias.
  alias_method :t5, :ts

  # construct, given a file name; prints a message and exits with status 1
  # when the file does not start with the "TIDX" magic bytes
  def initialize(fileName)
    @inputFile = fileName
    @file = BinaryFileReader.new(fileName)
    magic = @file.readBytes(4)
    if magic != "TIDX"
      print "#{@inputFile}: !! Magic number is not what we expected, TIDX, instead we saw #{magic} !!\n"
      exit(1)
    end
    @type = @file.readInt()
    @headerVersion = @file.readInt()
    @fileName = @file.readString()
    @fileSize = @file.readLong()
    @ts = @file.readLong()
    @md5 = @file.readString()
    @flags = @file.readUInt()
    # a flags value of exactly 32768 is treated as "a sequence dictionary follows"
    @seqDict = readSeqDictionary() if @flags == 32768
    # despite its name, @propCount holds the whole property hash
    @propCount = readPropertyDictionary() if @headerVersion >= 3
  end

  # Check the header fields and print one line per problem found.
  # Returns true when nothing was reported, false otherwise (warnings count
  # as failures, as they always did). Every failed check is now reported;
  # previously only the last one was printed.
  def validate()
    problems = []
    if @type < 1 || @type > 2
      problems << "error: invalid type, we saw #{@type} but expected [1-2]"
    end
    if @headerVersion < 1 || @headerVersion > 3
      problems << "error: invalid header version, we saw #{@headerVersion} but expected [1-3]"
    end
    if @fileName == ""
      problems << "warning: on fileName, we saw '#{@fileName}' but expected actual text"
    end
    # NOTE(review): @ts comes from readLong, so it is an integer and this
    # comparison can never be true; kept until the intended check is confirmed.
    if @ts == ""
      problems << "warning: on TS, we saw '#{@ts}' but expected actual text"
    end
    if @md5 == ""
      problems << "warning: on md5, we saw '#{@md5}' but expected actual text"
    end
    problems.each { |problem| print "#{@inputFile}:\t\t#{problem}\n" }
    problems.empty?
  end

  # diff two headers: compare every instance variable (except the reader and
  # the per-sequence entries) by its string form and print each mismatch
  def diffHeader(otherIndex)
    self.instance_variables.each { |var|
      next if "#{var}" == "@file" || "#{var}" == "@sequences"
      puts "Other header doesn't define #{var}" if !(otherIndex.instance_variable_defined?(var))
      one = (self.instance_variable_get(var)).to_s
      two = (otherIndex.instance_variable_get(var)).to_s
      puts "type #{var} not equal, #{one} != #{two}" if one != two
    }
  end

  # read the sequence dictionary, assuming we have one; returns the array of
  # sequence names
  def readSeqDictionary()
    sequences = []
    count = @file.readInt()
    count.times {
      sequences.push(@file.readString()) # Array has no #add
      @file.readInt() # drop the sizes for now
    }
    sequences # return sequences
  end

  # read the property dictionary, assuming we have one; returns a hash of
  # key => value strings
  def readPropertyDictionary()
    properties = {}
    count = @file.readInt()
    count.times {
      # the key is stored first, then its value
      key = @file.readString()
      properties[key] = @file.readString() # Hash has no #put
    }
    properties # return properties
  end

  # close the underlying reader
  def close()
    @file.close()
  end
end
|
||||||
|
|
@ -0,0 +1,41 @@
|
||||||
|
# the implementation of the interval index class
|
||||||
|
$LOAD_PATH << File.dirname(__FILE__)
|
||||||
|
require "Index.rb"
|
||||||
|
|
||||||
|
# An interval index: the common Index header followed by a sequence count and
# one TISeqEntry per sequence.
class IntervalIndex < Index
  # parse the shared header via Index, then read @nSeq entries from the same reader
  def initialize(file)
    super(file)
    @nSeq = @file.readInt
    @sequences = []
    @nSeq.times { @sequences << TISeqEntry.new(@file) }
  end

  # Print the header differences and, when the two indexes are of different
  # types, a type-mismatch message. The per-sequence entries are not compared,
  # and the method returns false in every case.
  def diff(otherIndex)
    diffHeader(otherIndex)
    unless otherIndex.type == @type
      print "Indexes are not the same type (#{otherIndex.type} != #{@type})\n"
    end
    false
  end
end
|
||||||
|
|
||||||
|
# One per-contig record of an interval index.
class TISeqEntry
  # Read the record from +file+, which must respond to readString, readInt and
  # readLong: the contig name, the bin count, then for each bin a start (int),
  # an end (int), a position (long) and a size (int).
  def initialize(file)
    @contig = file.readString
    @binCount = file.readInt
    @startPositions = []
    @endPositions = []
    @positions = []
    @sizes = []
    @binCount.times do
      @startPositions << file.readInt
      @endPositions << file.readInt
      @positions << file.readLong
      @sizes << file.readInt
    end
  end
end
|
||||||
|
|
||||||
|
|
@ -0,0 +1,59 @@
|
||||||
|
# the linear index class implementation
|
||||||
|
$LOAD_PATH << File.dirname(__FILE__)
|
||||||
|
require "Index.rb"
|
||||||
|
|
||||||
|
# A linear index: the common Index header followed by a sequence count and one
# LISeqEntry per sequence.
class LinearIndex < Index
  attr_accessor :nSeq, :sequences

  # parse the shared header via Index, then read @nSeq entries from the same reader
  def initialize(file)
    super(file)
    @nSeq = @file.readInt()
    @sequences = []
    @nSeq.times { @sequences.push(LISeqEntry.new(@file)) }
  end

  # Diff this index against +otherIndex+.
  #
  # Header differences are always printed. If the two indexes are of different
  # types the comparison stops there and false is returned. Otherwise every
  # sequence entry is matched to the other index's entry with the same contig
  # name and diffed field by field; entries with no counterpart are listed.
  #
  # Returns true when every contig of this index exists in the other one.
  # Field-level differences are printed but do not change the return value.
  def diff(otherIndex)
    diffHeader(otherIndex)
    if otherIndex.type != @type
      print "Indexes are not the same type (#{otherIndex.type} != #{@type})\n"
      return false
    end

    # LISeqEntry defines no ==, so include?/index would compare object
    # identity and never match; look entries up by contig name instead.
    otherByContig = {}
    otherIndex.sequences.each { |entry|
      otherByContig[entry.instance_variable_get(:@contig)] = entry
    }

    notInOther = []
    @sequences.each { |item|
      counterpart = otherByContig[item.instance_variable_get(:@contig)]
      if counterpart
        item.diff(counterpart)
      else
        notInOther.push(item)
      end
    }

    notInOther.each { |item|
      puts "Sequence #{item.instance_variable_get(:@contig)} is not present in the other index"
    }
    notInOther.empty?
  end
end
|
||||||
|
|
||||||
|
# One per-contig record of a linear index.
class LISeqEntry
  # Read the record from +file+, which must respond to readString, readInt and
  # readLong: contig name, bin width, bin count, longest feature, max bin,
  # total bin, one long start position per bin, and a trailing long.
  def initialize(file)
    @contig = file.readString
    @binWidth = file.readInt
    @binCount = file.readInt
    @longestFeature = file.readInt
    @maxBin = file.readInt
    @totalBin = file.readInt
    @startPositions = []
    @binCount.times do
      @startPositions << file.readLong
    end
    @finalPos = file.readLong
  end

  # Compare this entry with another one, instance variable by instance
  # variable (using each value's string form), printing a line for every
  # variable the other entry lacks and for every value that differs.
  def diff(otherLISeqEntry)
    instance_variables.each do |var|
      name = var.to_s
      next if name == "@file" || name == "@sequences"

      unless otherLISeqEntry.instance_variable_defined?(var)
        puts "Other LISeqEntry doesn't define #{var}"
      end
      mine = instance_variable_get(var).to_s
      theirs = otherLISeqEntry.instance_variable_get(var).to_s
      puts "otherLISeqEntry: type #{var} not equal, #{mine} != #{theirs}" unless mine == theirs
    end
  end
end
|
||||||
|
|
||||||
|
|
@ -0,0 +1,170 @@
|
||||||
|
# this Ruby file takes two indexes (of the same type) and diffs them. If they're different types,
|
||||||
|
# it'll stop after the header
|
||||||
|
|
||||||
|
# a function to exit, printing a message framed by two banner lines;
# always terminates the process with exit status 1
def exitWithError(message)
  # The banner has to be a quoted string: a bare run of '#' after puts starts
  # a comment, which made the original print empty lines instead.
  banner = "###########################################"
  puts banner
  puts message
  puts banner
  exit(1)
end
|
||||||
|
|
||||||
|
|
||||||
|
# exactly two index files are expected on the command line
if ARGV.size != 2
  exitWithError("We take two files as input, try again!")
end

# open the indexes in binary mode: the content is raw bytes, so no newline or
# encoding translation must be applied to it
index1 = File.new(ARGV[0], "rb")
index2 = File.new(ARGV[1], "rb")
|
||||||
|
|
||||||
|
# Read byteCount raw bytes from each of the two files and print, labelled with
# +type+, whether the two chunks are equal.
def compValues(file1, file2, byteCount, type)
  first = file1.sysread(byteCount)
  second = file2.sysread(byteCount)
  relation = first == second ? "==" : "!="
  print "#{type}, index1 (#{first}) #{relation} index2 (#{second})\n"
end
|
||||||
|
|
||||||
|
# Read byteCount bytes from each file, decode them as a native-endian signed
# integer (8-byte "q" when byteCount is above 4, 4-byte "i" otherwise) and
# print, labelled with +type+, whether the decoded values are equal. The
# values are printed in the one-element array form that unpack returns.
def compInts(file1, file2, byteCount, type)
  directive = byteCount > 4 ? "q" : "i"
  left = file1.sysread(byteCount).unpack(directive)
  right = file2.sysread(byteCount).unpack(directive)
  relation = left == right ? "==" : "!="
  print "#{type}, index1 (#{left}) #{relation} index2 (#{right})\n"
end
|
||||||
|
|
||||||
|
|
||||||
|
# a helper function for reading strings: consumes bytes from +index+ one at a
# time up to and including the terminating NUL and returns the bytes before it
def readString(index)
  buffer = []
  ch = index.sysread(1)
  while ch != "\0"
    buffer.push(ch)
    ch = index.sysread(1)
  end
  # join, not to_s: since Ruby 1.9 Array#to_s returns the inspect form
  # (e.g. ["c", "h", "r"]) rather than the concatenated characters
  buffer.join
end
|
||||||
|
|
||||||
|
# validate the magic number from both (index 1 is checked first)
exitWithError("Magic number not valid for index 1") unless index1.sysread(4) == "TIDX"
exitWithError("Magic number not valid for index 2") unless index2.sysread(4) == "TIDX"

# compare the types
compInts(index1, index2, 4, "types")

# compare the versions; v1 and v2 stay one-element arrays because later code
# indexes into them
v1 = index1.sysread(4).unpack("i")
v2 = index2.sysread(4).unpack("i")
versionRelation = v1 == v2 ? "==" : "!="
print "version, index1 (#{v1}) #{versionRelation} index2 (#{v2})\n"

# compare the filenames
name1 = readString(index1)
name2 = readString(index2)
nameRelation = name1 == name2 ? "==" : "!="
print "filename, index1 (#{name1}) #{nameRelation} index2 (#{name2})\n"

# compare the sizes
compInts(index1, index2, 8, "sizes")

# compare the T5? (8 raw bytes)
compValues(index1, index2, 8, "T5")

# compare the MD5 - just a byte, we don't write the MD5 sums in yet
md5One = readString(index1)
md5Two = readString(index2)
md5Relation = md5One == md5Two ? "==" : "!="
print "md5, index1 (#{md5One}) #{md5Relation} index2 (#{md5Two})\n"

# compare the flags; unlike the fields above, only a difference is reported
index1Flags = index1.sysread(4).unpack("L")
index2Flags = index2.sysread(4).unpack("L")
unless index1Flags == index2Flags
  print "Flags are different, index1 = #{index1Flags[0]}, index2 = #{index2Flags[0]}\n"
end
|
||||||
|
|
||||||
|
# Read a sequence dictionary from +file+: a 4-byte count followed by, for each
# sequence, a NUL-terminated name and a 4-byte size (which is skipped).
# Prints a progress line and the count, and returns the array of names.
def readSeqDictionary(file)
  puts "reading seq dict"
  sequences = []
  count = (file.sysread(4)).unpack("i")
  puts count
  count[0].times {
    sequences.push(readString(file).to_s) # Array has no #add
    file.sysread(4) # drop the sizes for now
  }
  sequences # return sequences
end
|
||||||
|
|
||||||
|
# Each file carries its own sequence dictionary, so the two must be handled
# independently: with an elsif, index 2's dictionary was left unread whenever
# index 1 had one, and everything read from index 2 afterwards was misaligned.
puts readSeqDictionary(index1) if index1Flags[0] == 32768
puts readSeqDictionary(index2) if index2Flags[0] == 32768

# bump off the prop dictionary (headers of version 3 and up): a 4-byte count
# followed by that many key/value string pairs. Reading only the count, as
# before, is correct only when the dictionary is empty.
[[index1, v1], [index2, v2]].each { |index, version|
  next unless version[0] >= 3
  propCount = index.sysread(4).unpack("i")[0]
  propCount.times {
    readString(index) # key
    readString(index) # value
  }
}
|
||||||
|
|
||||||
|
# Read one linear-index sequence entry from +i1+ and print its fields: contig
# name, bin width, bin count, longest feature, max bin size, total bin size,
# then every bin start position alongside the previous one.
#
# The entry ends with one more 8-byte value after the bin starts (the same
# field LISeqEntry reads as @finalPos). It is consumed and printed here too;
# skipping it left every following entry misaligned by 8 bytes.
def readSeqEntry(i1)
  puts "--------------------------------------------"
  puts "Contig --> #{readString(i1).to_s}"
  print "bin width = #{(i1.sysread(4)).unpack("i")}\n"
  binCount = (i1.sysread(4)).unpack("i")[0]
  print "number of bins = #{binCount}\n"
  print "longest feature = #{(i1.sysread(4)).unpack("i")}\n"
  print "max bin size = #{(i1.sysread(4)).unpack("i")}\n"
  print "total bin size = #{(i1.sysread(4)).unpack("i")}\n"
  lastStartPos = -1
  binCount.times { |index|
    startPos = (i1.sysread(8)).unpack("q")[0]
    puts "bin at index #{index}, this start = #{startPos}, last start = #{lastStartPos}"
    lastStartPos = startPos
  }
  finalPos = (i1.sysread(8)).unpack("q")[0]
  puts "final position = #{finalPos}"
end
|
||||||
|
|
||||||
|
# Read the 4-byte sequence count from each index, print both counts, then
# dump every sequence entry of index 1 followed by every entry of index 2.
# The entries are only printed via readSeqEntry, not compared with each other.
def compContigLinear(i1, i2)
  firstCount = i1.sysread(4).unpack("L")
  secondCount = i2.sysread(4).unpack("L")
  print "seq count 1 = #{firstCount}, count 2 = #{secondCount}\n"

  puts "\nentries for index 1"
  firstCount[0].times do
    readSeqEntry(i1)
  end

  puts "\nentries for index 2"
  secondCount[0].times do
    readSeqEntry(i2)
  end
end
|
||||||
|
|
||||||
|
# dump the per-contig entries of both indexes
compContigLinear(index1, index2)

print "Done!\n"

# close the files, index 1 first
[index1, index2].each { |openIndex| openIndex.close() }
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
# a ruby class for reading in binary files; really this just adds some
# convenience methods like readInt(), readLong(), etc.
# All integer reads use the machine's native byte order.
class BinaryFileReader
  # constructor: opens fileName for reading in binary mode, so no newline or
  # encoding translation is applied to the raw bytes
  def initialize(fileName)
    @file = File.open(fileName, "rb")
  end

  # read and return an int (4 byte, signed, machine based endian)
  def readInt()
    @file.sysread(4).unpack("i")[0]
  end

  # read and return an int (4 byte, unsigned, machine based endian)
  def readUInt()
    @file.sysread(4).unpack("L")[0]
  end

  # read and return a long (8 byte, signed, machine based endian)
  def readLong()
    @file.sysread(8).unpack("q")[0]
  end

  # read and return a set number of bytes as a string
  def readBytes(count)
    @file.sysread(count).to_s
  end

  # read and return a null terminated string (the terminator is consumed but
  # not included in the result)
  def readString()
    buffer = []
    ch = @file.sysread(1)
    while ch != "\0"
      buffer.push(ch)
      ch = @file.sysread(1)
    end
    # join, not to_s: since Ruby 1.9 Array#to_s returns the inspect form
    # (e.g. ["c", "h", "r"]) rather than the concatenated characters
    buffer.join
  end

  # close the file
  def close()
    @file.close()
  end
end
|
||||||
|
|
||||||
|
|
@ -0,0 +1,103 @@
|
||||||
|
# this Ruby file validates, diffs, or prints linear and interval indexes
|
||||||
|
# set the include path to include the current directory
|
||||||
|
$LOAD_PATH << File.dirname(__FILE__)
|
||||||
|
|
||||||
|
# require a couple of files
|
||||||
|
require "index/Index.rb"
|
||||||
|
require "index/LinearIndex.rb"
|
||||||
|
require "index/IntervalIndex.rb"
|
||||||
|
require "optparse"
|
||||||
|
require "yaml"
|
||||||
|
|
||||||
|
# This hash will hold all of the options
# parsed from the command-line by
# OptionParser.
options = {}

optparse = OptionParser.new do |opts|
  # Set a banner, displayed at the top
  # of the help screen. Index files are only picked up through -i/--index,
  # so the usage line shows that form rather than bare file arguments.
  opts.banner = "Usage: ruby validateIndex.rb [options] -i index1 [-i index2 ...]"

  # Define the options, and what they do
  options[:index] = [] if options[:index] == nil
  opts.on( '-i', '--index INDEX (REQUIRED)', 'Specify the index. Multiple are allowed' ) do |file| options[:index].push(file) end

  options[:verbose] = false
  opts.on( '-v', '--verbose', 'Output more information' ) do options[:verbose] = true end

  # -c/--check stores its result under :check; :validate is never set by any
  # option and keeps its default only because later code still reads it
  options[:check] = false
  options[:validate] = false
  opts.on( '-c', '--check', 'Check (Validate) the index(es) passed in as parameters' ) do options[:check] = true end

  options[:diff] = false
  opts.on( '-d', '--diff', 'Diff two indexes' ) do options[:diff] = true end

  options[:print] = false
  opts.on( '-p', '--print', 'Print all of the information about the file' ) do options[:print] = true end

  # This displays the help screen, all programs are
  # assumed to have this option.
  opts.on_tail( '-h', '--help', 'Display this screen' ) do
    puts opts
    exit
  end
end

# parse the command line
optparse.parse!

# at least one index file is required: show the usage and stop with a
# non-zero status instead of carrying on with nothing to do
if options[:index].size == 0
  puts "you must at least specify an index file!"
  puts optparse
  exit(1)
end
|
||||||
|
|
||||||
|
# Load the index stored in +file+. The header is parsed once with the base
# Index class (closed again right away) to learn the type; the file is then
# re-read as a LinearIndex when the type is 1 and as an IntervalIndex for any
# other type. Prints which kind was chosen and returns the loaded index.
def loadIndex(file)
  probe = Index.new(file)
  probe.close()
  if probe.type == 1
    puts "Linear index..."
    LinearIndex.new(file)
  else
    puts "Interval index..."
    IntervalIndex.new(file)
  end
end
|
||||||
|
|
||||||
|
#################### Control Block ####################
|
||||||
|
|
||||||
|
# load all of the indexes
indexes = options[:index].map { |indexFile| loadIndex(indexFile) }

# switch on the flags supplied
if options[:diff]
  if options[:index].size != 2
    print "Unable to diff indexes if you don't supply two and only two indexes\n"
    exit(1)
  else
    indexes[0].diff(indexes[1])
  end
elsif options[:validate]
  indexes.each { |index| index.validate() }
elsif options[:print]
  indexes.each { |index| puts YAML::dump( index ) }
end

# if they specified -c/--check, validate every index. The already-loaded
# indexes are reused: validate() only looks at the header fields, and opening
# each file again through Index.new left those extra readers unclosed.
if options[:check]
  indexes.each { |index| index.validate() }
end

# release the file handles held by the loaded indexes
indexes.each { |index| index.close() }

exit
|
||||||
Loading…
Reference in New Issue