# Roadmap of the Ruby directory (from ruby/README):
#   restartBamboo.rb - a script used to restart Bamboo after system failures
#   validateIndex.rb - a script for working with Tribble indexes,
#                      type 'validateIndex.rb -h' for help
#   util/            - basic utility classes (BinaryFileReader)
#   index/           - the index utility code (the classes below)

# ---------------------------------------------------------------------------
# index/Index.rb
# ---------------------------------------------------------------------------

# Base class for all index types (at least linear and interval).
#
# The constructor reads the common header through a BinaryFileReader and
# leaves the reader positioned on the type-specific payload, which the
# subclasses consume. Header layout as read here: 4 magic bytes, int type,
# int header version, string file name, long file size, long time stamp,
# string md5, unsigned int flags, then an optional sequence dictionary and
# (header version >= 3) a property dictionary.
class Index
  # Magic bytes expected at the start of every index file.
  MAGIC = "TIDX"

  # Flag bit announcing that a sequence dictionary follows the header.
  # NOTE(review): the original compared the whole flags word to 32768; a bit
  # test accepts the same files and stays correct if other bits are set.
  SEQUENCE_DICTIONARY_FLAG = 0x8000

  # Instance variables that are not header content and are skipped by
  # diffHeader: the reader, the per-contig payload and the path the index
  # was loaded from (which differs for any two distinct files).
  HEADER_DIFF_SKIP = ["@file", "@sequences", "@inputFile"]

  attr_reader :type, :headerVersion, :fileName, :fileSize, :ts, :md5, :flags

  # Historical reader name; the original declared :t5 but stored @ts, so the
  # time stamp was never reachable. Kept as an alias for compatibility.
  alias_method :t5, :ts

  # Opens +fileName+ and parses the common header.
  # Prints a message and exits with status 1 when the magic bytes are wrong.
  def initialize(fileName)
    # Load the reader relative to this file rather than the current working
    # directory; skipped when the class is already available.
    unless defined?(BinaryFileReader)
      require File.expand_path("../util/BinaryFileReader", File.dirname(__FILE__))
    end

    @inputFile = fileName
    @file = BinaryFileReader.new(fileName)
    magic = @file.readBytes(4)
    if magic != MAGIC
      print "#{@inputFile}: !! Magic number is not what we expected, TIDX, instead we saw #{magic} !!\n"
      @file.close
      exit(1)
    end
    @type = @file.readInt
    @headerVersion = @file.readInt
    @fileName = @file.readString
    @fileSize = @file.readLong
    @ts = @file.readLong
    @md5 = @file.readString
    @flags = @file.readUInt
    @seqDict = readSeqDictionary if (@flags & SEQUENCE_DICTIONARY_FLAG) != 0
    # NOTE: despite its name @propCount holds the property dictionary itself.
    @propCount = readPropertyDictionary if @headerVersion >= 3
  end

  # Checks the header fields and prints one line per problem found.
  # Returns true when no problem (error or warning) was reported.
  def validate
    problems = []
    if @type < 1 || @type > 2
      problems << "error: invalid type, we saw #{@type} but expected [1-2]"
    end
    if @headerVersion < 1 || @headerVersion > 3
      problems << "error: invalid header version, we saw #{@headerVersion} but expected [1-3]"
    end
    if @fileName == ""
      problems << "warning: on fileName, we saw '#{@fileName}' but expected actual text"
    end
    # @ts is read as a long, so this only triggers if it was never populated.
    if @ts.nil? || @ts == ""
      problems << "warning: on TS, we saw '#{@ts}' but expected actual text"
    end
    if @md5 == ""
      problems << "warning: on md5, we saw '#{@md5}' but expected actual text"
    end
    problems.each { |problem| print "#{@inputFile}:\t\t#{problem}\n" }
    problems.empty?
  end

  # Diffs the header fields of two indexes, printing one line per mismatch.
  # Values are compared through their string form.
  # Returns the names of the instance variables that differ (empty if none).
  def diffHeader(otherIndex)
    instance_variables.select do |var|
      next false if HEADER_DIFF_SKIP.include?(var.to_s)
      missing = !otherIndex.instance_variable_defined?(var)
      puts "Other header doesn't define #{var}" if missing
      one = instance_variable_get(var).to_s
      two = otherIndex.instance_variable_get(var).to_s
      puts "type #{var} not equal, #{one} != #{two}" if one != two
      missing || one != two
    end
  end

  # Reads the sequence dictionary: an int count followed by that many
  # (name, size) pairs. Returns the names; the sizes are dropped for now.
  def readSeqDictionary
    count = @file.readInt
    (0...count).map do
      name = @file.readString
      @file.readInt # drop the sizes for now
      name
    end
  end

  # Reads the property dictionary: an int count followed by that many
  # (key, value) string pairs. Returns them as a Hash.
  def readPropertyDictionary
    properties = {}
    count = @file.readInt
    count.times do
      key = @file.readString
      properties[key] = @file.readString
    end
    properties
  end

  # Closes the underlying reader.
  def close
    @file.close
  end
end

# ---------------------------------------------------------------------------
# index/IntervalIndex.rb
# ---------------------------------------------------------------------------

# The interval index: the common header followed by an int sequence count
# and one TISeqEntry per sequence.
class IntervalIndex < Index
  attr_reader :nSeq, :sequences

  def initialize(file)
    super(file)
    @nSeq = @file.readInt
    @sequences = (0...@nSeq).map { TISeqEntry.new(@file) }
  end

  # Diffs the headers and checks that both indexes have the same type.
  # Always returns false, as the original did; the per-sequence comparison
  # is not implemented for interval indexes.
  def diff(otherIndex)
    diffHeader(otherIndex)
    if otherIndex.type != @type
      print "Indexes are not the same type (#{otherIndex.type} != #{@type})\n"
    end
    false
  end
end

# One sequence of an interval index: the contig name, an int bin count and,
# per bin, int start, int end, long position and int size.
class TISeqEntry
  attr_reader :contig

  def initialize(file)
    @contig = file.readString
    @binCount = file.readInt
    @startPositions = []
    @endPositions = []
    @positions = []
    @sizes = []
    @binCount.times do
      @startPositions.push(file.readInt)
      @endPositions.push(file.readInt)
      @positions.push(file.readLong)
      @sizes.push(file.readInt)
    end
  end
end

# ---------------------------------------------------------------------------
# index/LinearIndex.rb
# ---------------------------------------------------------------------------

# The linear index: the common header followed by an int sequence count and
# one LISeqEntry per sequence.
class LinearIndex < Index
  attr_accessor :nSeq, :sequences

  def initialize(file)
    super(file)
    @nSeq = @file.readInt
    @sequences = (0...@nSeq).map { LISeqEntry.new(@file) }
  end

  # Diffs this index against +otherIndex+, printing every difference.
  #
  # Header differences are printed but do not affect the result. Returns
  # false when the types differ, when a contig exists in only one of the two
  # indexes, or when any pair of same-named entries differs; true otherwise.
  #
  # Entries are matched by contig name: LISeqEntry defines no ==, so the
  # original include?/index lookup compared object identity and never
  # matched anything.
  def diff(otherIndex)
    diffHeader(otherIndex)
    if otherIndex.type != @type
      print "Indexes are not the same type (#{otherIndex.type} != #{@type})\n"
      return false
    end

    remaining = {}
    otherIndex.sequences.each { |entry| remaining[entry.contig] = entry }

    identical = true
    @sequences.each do |entry|
      counterpart = remaining.delete(entry.contig)
      if counterpart.nil?
        puts "Contig #{entry.contig} is missing from the other index"
        identical = false
      elsif !entry.diff(counterpart).empty?
        identical = false
      end
    end
    remaining.each_key do |contig|
      puts "Contig #{contig} is only present in the other index"
      identical = false
    end
    identical
  end
end

# One sequence of a linear index: contig name, int bin width, int bin count,
# int longest feature, int max bin, int total bin, one long start position
# per bin and a final long position.
class LISeqEntry
  attr_reader :contig

  def initialize(file)
    @contig = file.readString
    @binWidth = file.readInt
    @binCount = file.readInt
    @longestFeature = file.readInt
    @maxBin = file.readInt
    @totalBin = file.readInt
    @startPositions = (0...@binCount).map { file.readLong }
    @finalPos = file.readLong
  end

  # Diffs this entry against another one, printing one line per mismatch.
  # Values are compared through their string form.
  # Returns the names of the instance variables that differ (empty if none).
  def diff(otherLISeqEntry)
    instance_variables.select do |var|
      next false if var.to_s == "@file" || var.to_s == "@sequences"
      missing = !otherLISeqEntry.instance_variable_defined?(var)
      puts "Other LISeqEntry doesn't define #{var}" if missing
      one = instance_variable_get(var).to_s
      two = otherLISeqEntry.instance_variable_get(var).to_s
      puts "otherLISeqEntry: type #{var} not equal, #{one} != #{two}" if one != two
      missing || one != two
    end
  end
end
# index/diffIndexes.rb
#
# This ruby file takes two indexes (of the same type) and diffs them field
# by field, printing the result of every comparison. The per-contig section
# is read with the linear index layout.
#
# Usage: ruby diffIndexes.rb <index1> <index2>

# Flag bit announcing that a sequence dictionary follows the header.
SEQUENCE_DICTIONARY_FLAG = 0x8000

# Prints +message+ between two banner lines and exits with status 1.
# (The original banners were bare `puts ####...`, which Ruby parses as an
# empty puts followed by a comment.)
def exitWithError(message)
  banner = "#" * 43
  puts banner
  puts message
  puts banner
  exit(1)
end

# Reads +byteCount+ raw bytes from each file and prints whether they match.
# +type+ is the label used in the output.
def compValues(file1, file2, byteCount, type)
  value1 = file1.sysread(byteCount)
  value2 = file2.sysread(byteCount)
  relation = value1 == value2 ? "==" : "!="
  print "#{type}, index1 (#{value1}) #{relation} index2 (#{value2})\n"
end

# Reads one native-endian signed integer from each file (8 bytes when
# +byteCount+ > 4, otherwise 4) and prints whether they match.
def compInts(file1, file2, byteCount, type)
  format = byteCount > 4 ? "q" : "i"
  value1 = file1.sysread(byteCount).unpack(format)[0]
  value2 = file2.sysread(byteCount).unpack(format)[0]
  relation = value1 == value2 ? "==" : "!="
  print "#{type}, index1 (#{value1}) #{relation} index2 (#{value2})\n"
end

# Reads a null-terminated string from each file and prints whether they match.
def compStrings(file1, file2, type)
  value1 = readString(file1)
  value2 = readString(file2)
  relation = value1 == value2 ? "==" : "!="
  print "#{type}, index1 (#{value1}) #{relation} index2 (#{value2})\n"
end

# Reads and returns a null-terminated string (terminator not included).
def readString(index)
  buffer = []
  while (ch = index.sysread(1)) != "\0"
    buffer.push(ch)
  end
  buffer.join # Array#to_s only joined on Ruby 1.8
end

# Reads the sequence dictionary (int count, then name/size pairs) and
# returns the names; the sizes are dropped for now.
def readSeqDictionary(file)
  puts "reading seq dict"
  count = file.sysread(4).unpack("i")[0]
  puts count
  (0...count).map do
    name = readString(file)
    file.sysread(4) # drop the sizes for now
    name
  end
end

# Skips the property dictionary: an int count followed by that many
# (key, value) string pairs. The original skipped only the count, which
# desynchronised the stream whenever properties were present.
def skipPropertyDictionary(file)
  count = file.sysread(4).unpack("i")[0]
  count.times do
    readString(file) # key
    readString(file) # value
  end
end

# Reads one linear-index sequence entry from +i1+ and prints its fields,
# including every bin start together with the previous one.
def readSeqEntry(i1)
  puts "--------------------------------------------"
  puts "Contig --> #{readString(i1)}"
  print "bin width = #{i1.sysread(4).unpack("i")[0]}\n"
  binCount = i1.sysread(4).unpack("i")[0]
  print "number of bins = #{binCount}\n"
  print "longest feature = #{i1.sysread(4).unpack("i")[0]}\n"
  print "max bin size = #{i1.sysread(4).unpack("i")[0]}\n"
  print "total bin size = #{i1.sysread(4).unpack("i")[0]}\n"
  lastStartPos = -1
  binCount.times do |index|
    startPos = i1.sysread(8).unpack("q")[0]
    puts "bin at index #{index}, this start = #{startPos}, last start = #{lastStartPos}"
    lastStartPos = startPos
  end
end

# Reads the sequence count of both indexes, then prints every sequence
# entry of index 1 followed by every entry of index 2.
def compContigLinear(i1, i2)
  seqC1 = i1.sysread(4).unpack("L")[0]
  seqC2 = i2.sysread(4).unpack("L")[0]
  print "seq count 1 = #{seqC1}, count 2 = #{seqC2}\n"
  puts "\nentries for index 1"
  seqC1.times { readSeqEntry(i1) }
  puts "\nentries for index 2"
  seqC2.times { readSeqEntry(i2) }
end

exitWithError("We take two files as input, try again!") if ARGV.size != 2

# open the indexes (binary mode: the content is not text)
index1 = File.new(ARGV[0], "rb")
index2 = File.new(ARGV[1], "rb")

begin
  # validate the magic number from both
  exitWithError("Magic number not valid for index 1") if index1.sysread(4) != "TIDX"
  exitWithError("Magic number not valid for index 2") if index2.sysread(4) != "TIDX"

  # validate the types
  compInts(index1, index2, 4, "types")

  # validate the versions (kept: they decide whether properties follow)
  v1 = index1.sysread(4).unpack("i")[0]
  v2 = index2.sysread(4).unpack("i")[0]
  if v1 != v2
    print "version, index1 (#{v1}) != index2 (#{v2})\n"
  else
    print "version, index1 (#{v1}) == index2 (#{v2})\n"
  end

  # validate the filenames
  compStrings(index1, index2, "filename")

  # validate the sizes
  compInts(index1, index2, 8, "sizes")

  # validate the time stamp (an 8 byte integer; printed as a number rather
  # than as raw bytes)
  compInts(index1, index2, 8, "T5")

  # validate the MD5 - just a byte, we don't write the MD5 sums in yet
  compStrings(index1, index2, "md5")

  # validate the flags
  index1Flags = index1.sysread(4).unpack("L")[0]
  index2Flags = index2.sysread(4).unpack("L")[0]
  if index1Flags != index2Flags
    print "Flags are different, index1 = #{index1Flags}, index2 = #{index2Flags}\n"
  end

  # Each file is consumed independently: the original if/elsif read the
  # dictionary from only one index and left the other stream misaligned.
  puts readSeqDictionary(index1) if (index1Flags & SEQUENCE_DICTIONARY_FLAG) != 0
  puts readSeqDictionary(index2) if (index2Flags & SEQUENCE_DICTIONARY_FLAG) != 0

  # bump off the prop dictionary
  skipPropertyDictionary(index1) if v1 >= 3
  skipPropertyDictionary(index2) if v2 >= 3

  compContigLinear(index1, index2)

  print "Done!\n"
ensure
  # close the files, also when we bail out early
  index1.close
  index2.close
end
# util/BinaryFileReader.rb
#
# A ruby class for reading in binary files; really this just adds some
# convenience methods like readInt(), readLong(), etc. on top of File.
# All numeric reads use the machine's native byte order.
class BinaryFileReader
  # Opens +fileName+ for reading in binary mode, so no newline or encoding
  # translation is applied to the bytes.
  def initialize(fileName)
    @file = File.open(fileName, "rb")
  end

  # read and return an int (4 byte, signed, machine based endian)
  def readInt
    readExactly(4).unpack("i")[0]
  end

  # read and return an int (4 byte, unsigned, machine based endian)
  def readUInt
    readExactly(4).unpack("L")[0]
  end

  # read and return a long (8 byte, signed, machine based endian)
  def readLong
    readExactly(8).unpack("q")[0]
  end

  # read and return a set number of bytes as a string
  def readBytes(count)
    readExactly(count)
  end

  # Read and return a null terminated string (terminator not included).
  # Raises EOFError when the file ends before the terminator.
  def readString
    buffer = []
    while (ch = @file.sysread(1)) != "\0"
      buffer.push(ch)
    end
    # Array#to_s only concatenated on Ruby 1.8; join works everywhere.
    buffer.join
  end

  # close the file; calling it again is a no-op
  def close
    @file.close unless @file.closed?
  end

  private

  # Reads exactly +count+ bytes. Raises EOFError on a truncated file instead
  # of letting unpack return nil for a short read.
  def readExactly(count)
    return "" if count == 0
    data = @file.sysread(count)
    if data.bytesize < count
      raise EOFError, "expected #{count} bytes but only #{data.bytesize} were left"
    end
    data
  end
end
# validateIndex.rb
#
# Command-line front end for working with index files: it can check
# (validate) the header of each index, diff two indexes, or print everything
# that was read from each index.
#
# Usage: ruby validateIndex.rb [options] file1 file2 ...

# set the include path to include the current directory
$LOAD_PATH << File.dirname(__FILE__)

# require a couple of files
require "index/Index.rb"
require "index/LinearIndex.rb"
require "index/IntervalIndex.rb"
require "optparse"
require "yaml"

# This hash will hold all of the options parsed from the command-line by
# OptionParser. -c sets :check (the original initialised :validate instead,
# a key that nothing ever set).
options = {
  :index => [],
  :verbose => false,
  :check => false,
  :diff => false,
  :print => false
}

optparse = OptionParser.new do |opts|
  # Set a banner, displayed at the top of the help screen.
  opts.banner = "Usage: ruby validateIndex.rb [options] file1 file2 ..."

  # Define the options, and what they do
  opts.on('-i', '--index INDEX (REQUIRED)', 'Specify the index. Multiple are allowed') do |file|
    options[:index].push(file)
  end

  opts.on('-v', '--verbose', 'Output more information') do
    options[:verbose] = true
  end

  opts.on('-c', '--check', 'Check (Validate) the index(es) passed in as parameters') do
    options[:check] = true
  end

  opts.on('-d', '--diff', 'Diff two indexes') do
    options[:diff] = true
  end

  opts.on('-p', '--print', 'Print all of the information about the file') do
    options[:print] = true
  end

  # This displays the help screen, all programs are assumed to have this
  # option.
  opts.on_tail('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end

# parse the command line; whatever is left in ARGV are the positional files
# promised by the banner, so treat them as indexes too
optparse.parse!
options[:index].concat(ARGV)

# bail out if we have not been given any index
if options[:index].empty?
  puts "you must at least specify an index file!"
  puts optparse
  exit(1)
end

# Loads the index stored in +file+.
# The header is read once through the base class to learn the type; type 1
# is loaded as a LinearIndex, anything else as an IntervalIndex.
def loadIndex(file)
  probe = Index.new(file)
  type = probe.type
  probe.close
  if type == 1
    puts "Linear index..."
    LinearIndex.new(file)
  else
    puts "Interval index..."
    IntervalIndex.new(file)
  end
end

#################### Control Block ####################

# load all of the indexes
indexes = options[:index].map { |indexFile| loadIndex(indexFile) }

begin
  # switch on the flags supplied
  if options[:diff]
    if indexes.size != 2
      print "Unable to diff indexes if you don't supply two and only two indexes\n"
      exit(1)
    end
    indexes[0].diff(indexes[1])
  elsif options[:print]
    indexes.each { |index| puts YAML::dump(index) }
  end

  # if they specified check, validate the headers that are already loaded
  # (the original re-opened every file here and never closed it)
  indexes.each { |index| index.validate } if options[:check]
ensure
  indexes.each { |index| index.close }
end

exit