Adding some utility code I've found helpful when working with the Tribble index code

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4108 348d0f76-0448-11de-a6fe-93d51630548a
2010-08-25 15:38:47 +00:00 · 2010-08-25 15:38:47 +00:00 · bc0826801c
parent 4eff69d95e
commit bc0826801c
7 changed files with 496 additions and 0 deletions
--- a/ruby/README
+++ b/ruby/README
@ -0,0 +1,9 @@
+This file is a roadmap to the contents of the Ruby directory
+
+Contents:
+------------------------------------------------------------------------------------
+restartBamboo.rb 		- a script used to restart Bamboo after system failures
+validateIndex.rb		- a script for working with Tribble indexes, type 'validateIndex.rb -h' for help
+
+util/					- any basic utility methods
+index/					- the index utility code
--- a/ruby/index/Index.rb
+++ b/ruby/index/Index.rb
@ -0,0 +1,70 @@
+require './util/BinaryFileReader'
+# Base class for all index types (at least linear and tree). The constructor
+# parses the header fields shared by every index and leaves the reader
+# positioned right after them, so subclasses can continue with their own data.
+class Index
+	# Flag value that triggers reading a sequence dictionary.
+	# NOTE(review): it is compared with ==, so a file that sets any additional
+	# flag bit would skip the dictionary - confirm whether a bit test is intended.
+	SEQ_DICT_FLAG = 32768
+
+	attr_reader :type, :headerVersion, :fileName, :fileSize, :ts, :md5, :flags
+	# the value lives in @ts; :t5 used to be declared here but always returned nil
+	alias_method :t5, :ts
+
+	# Construct from a file name. Prints a message and exits with status 1 when
+	# the file does not start with the "TIDX" magic number.
+	def initialize(fileName)
+		@inputFile = fileName
+		@file = BinaryFileReader.new(fileName)
+		magic = @file.readBytes(4)
+		if (magic != "TIDX")
+			print "#{@inputFile}: !! Magic number is not what we expected, TIDX, instead we saw #{magic} !!\n"
+			exit(1)
+		end
+		@type = @file.readInt()
+		@headerVersion = @file.readInt()
+		@fileName = @file.readString()
+		@fileSize = @file.readLong()
+		@ts = @file.readLong()
+		@md5 = @file.readString()
+		@flags = @file.readUInt()
+		@seqDict = readSeqDictionary() if (@flags == SEQ_DICT_FLAG)
+		# despite its name, @propCount holds the whole key => value property hash
+		@propCount = readPropertyDictionary() if (@headerVersion >= 3)
+	end
+
+	# Check the header fields, printing one line per problem found.
+	# Returns true when nothing was reported, false otherwise (warnings included).
+	def validate()
+		problems = []
+		problems << "error: invalid type, we saw #{@type} but expected [1-2]" if @type < 1 or @type > 2
+		problems << "error: invalid header version, we saw #{@headerVersion} but expected [1-3]" if @headerVersion < 1 or @headerVersion > 3
+		problems << "warning: on fileName, we saw '#{@fileName}' but expected actual text" if @fileName == ""
+		# @ts is read with readLong, so this comparison never matches; kept for parity
+		problems << "warning: on TS, we saw '#{@ts}' but expected actual text" if @ts == ""
+		problems << "warning: on md5, we saw '#{@md5}' but expected actual text" if @md5 == ""
+		problems.each { |problem| print "#{@inputFile}:\t\t#{problem}\n" }
+		problems.empty?
+	end
+
+	# Diff two headers: compares every instance variable (except the reader and
+	# the subclass sequence list) by its string form and prints the mismatches.
+	def diffHeader(otherIndex)
+		self.instance_variables.each { |var|
+			next if "#{var}" == "@file" or  "#{var}" == "@sequences"
+			puts "Other header doesn't define #{var}" if !(otherIndex.instance_variable_defined?(var))
+			one = (self.instance_variable_get(var)).to_s
+			two = (otherIndex.instance_variable_get(var)).to_s
+			puts "type #{var} not equal, #{one} != #{two}" if one != two
+		}
+	end
+
+	# Read the sequence dictionary, assuming we have one: a count followed by
+	# that many (name, size) pairs. Returns the array of names.
+	def readSeqDictionary()
+		sequences = []
+		count = @file.readInt()
+		count.times {
+			sequences << @file.readString()
+			@file.readInt() # drop the sizes for now
+		}
+		sequences
+	end
+
+	# Read the property dictionary: a count followed by that many key/value
+	# string pairs. Returns them as a hash.
+	def readPropertyDictionary()
+		properties = {}
+		count = @file.readInt()
+		count.times {
+			key = @file.readString()
+			value = @file.readString()
+			properties[key] = value
+		}
+		properties
+	end
+
+	# Close the underlying reader.
+	def close()
+		@file.close()
+	end
+end
--- a/ruby/index/IntervalIndex.rb
+++ b/ruby/index/IntervalIndex.rb
@ -0,0 +1,41 @@
+# the implementation of the interval index class
+$LOAD_PATH << File.dirname(__FILE__)
+require "Index.rb"
+
+# Index subclass for interval indexes: after the shared header it reads a
+# sequence count followed by that many TISeqEntry records.
+class IntervalIndex < Index
+	def initialize(file)
+		super(file)
+		@nSeq = @file.readInt()
+		@sequences = []
+		@nSeq.times { @sequences << TISeqEntry.new(@file) }
+	end
+
+	# Diff the headers and report a type mismatch. Only the header is compared
+	# so far, and the method returns false in every case.
+	def diff(otherIndex)
+		diffHeader(otherIndex)
+		unless otherIndex.type == @type
+			print "Indexes are not the same type (#{otherIndex.type} != #{@type})\n"
+		end
+		false
+	end
+end
+
+# One per-contig record of an interval index: the contig name, a bin count,
+# and for each bin a start (int), end (int), position (long) and size (int).
+class TISeqEntry
+	def initialize(file)
+		@contig = file.readString()
+		@binCount = file.readInt()
+		@startPositions = []
+		@endPositions = []
+		@positions = []
+		@sizes = []
+		# the four values of a bin are stored together, so read them in file order
+		@binCount.times do
+			@startPositions << file.readInt()
+			@endPositions << file.readInt()
+			@positions << file.readLong()
+			@sizes << file.readInt()
+		end
+	end
+end
+
--- a/ruby/index/LinearIndex.rb
+++ b/ruby/index/LinearIndex.rb
@ -0,0 +1,59 @@
+# the linear index class implementation
+$LOAD_PATH << File.dirname(__FILE__)
+require "Index.rb"
+
+# Index subclass for linear indexes: after the shared header it reads a
+# sequence count followed by that many LISeqEntry records.
+class LinearIndex < Index
+	attr_accessor :nSeq, :sequences
+	def initialize(file)
+		super(file)
+		@nSeq = @file.readInt()
+		@sequences = Array.new()
+		@nSeq.times {
+			@sequences.push(LISeqEntry.new(@file))
+		}
+	end
+
+	# Diff this index against another one, printing every difference found.
+	# Entries are paired by contig name, because entries loaded from two files
+	# are distinct objects and never compare equal by identity.
+	# Returns true when the types match and every entry of this index has an
+	# identical counterpart in the other index, false otherwise.
+	def diff(otherIndex)
+		diffHeader(otherIndex)
+		if (otherIndex.type != @type)
+			print "Indexes are not the same type (#{otherIndex.type} != #{@type})\n"
+			return false
+		end
+		same = true
+		@sequences.each { |entry|
+			counterpart = otherIndex.sequences.find { |other| other.contig == entry.contig }
+			if counterpart.nil?
+				puts "Contig #{entry.contig} is not present in the other index"
+				same = false
+			elsif !entry.diff(counterpart)
+				same = false
+			end
+		}
+		same
+	end
+end
+
+# One per-contig record of a linear index.
+class LISeqEntry
+	attr_reader :contig
+
+	def initialize(file)
+		@contig = file.readString()
+		@binWidth = file.readInt()
+		@binCount = file.readInt()
+		@longestFeature = file.readInt()
+		@maxBin = file.readInt()
+		@totalBin = file.readInt()
+		@startPositions = Array.new()
+		@binCount.times {
+			@startPositions.push(file.readLong())
+		}
+		# one extra position follows the per-bin start positions
+		@finalPos = file.readLong()
+	end
+
+	# Compare every instance variable with the other entry and print the ones
+	# that differ. Values are compared directly rather than through to_s, so
+	# arrays are compared element by element.
+	# Returns true when no difference was found.
+	def diff(otherLISeqEntry)
+		same = true
+		self.instance_variables.each { |var|
+			puts "Other LISeqEntry doesn't define #{var}" if !(otherLISeqEntry.instance_variable_defined?(var))
+			one = self.instance_variable_get(var)
+			two = otherLISeqEntry.instance_variable_get(var)
+			if one != two
+				puts "otherLISeqEntry: type #{var} not equal, #{one} != #{two}"
+				same = false
+			end
+		}
+		same
+	end
+end
+
--- a/ruby/index/diffIndexes.rb
+++ b/ruby/index/diffIndexes.rb
@ -0,0 +1,170 @@
+# this ruby file takes two indexes (of the same type) and diffs them.  If they're different types,
+# the results after the header are not meaningful
+
+# Print the message framed by two banner lines, then exit with status 1.
+# The banner has to be a string literal: a bare run of '#' after puts is
+# parsed as a comment and only printed empty lines.
+def exitWithError(message)
+	banner = "#" * 43
+	puts banner
+	puts message
+	puts banner
+	exit(1)
+end
+
+
+if (ARGV.size != 2)
+	exitWithError("We take two files as input, try again!")
+end
+
+# open the indexes; they are binary files, so ask for binary mode explicitly
+# to avoid newline and encoding translation on platforms that apply it
+index1 = File.new(ARGV[0], "rb")
+index2 = File.new(ARGV[1], "rb")
+
+# Read byteCount raw bytes from each file and print, under the given label,
+# whether the two chunks are equal.
+def compValues(file1, file2, byteCount, type)
+	first = file1.sysread(byteCount)
+	second = file2.sysread(byteCount)
+	relation = (first != second) ? "!=" : "=="
+	print "#{type}, index1 (#{first}) #{relation} index2 (#{second})\n"
+end
+
+# Read one integer of byteCount bytes from each file (unpacked as "q" when
+# wider than 4 bytes, as "i" otherwise) and print, under the given label,
+# whether the two values are equal.
+def compInts(file1, file2, byteCount, type)
+	directive = byteCount > 4 ? "q" : "i"
+	first = file1.sysread(byteCount).unpack(directive)
+	second = file2.sysread(byteCount).unpack(directive)
+	relation = (first != second) ? "!=" : "=="
+	print "#{type}, index1 (#{first}) #{relation} index2 (#{second})\n"
+end
+
+
+# Read a null-terminated string from the given file: bytes are consumed one at
+# a time up to and including the terminating "\0", which is not returned.
+# sysread raises EOFError if the file ends before a terminator is seen.
+def readString(index)
+	buffer = []
+	ch = index.sysread(1)
+	while (ch != "\0")
+		buffer.push(ch)
+		ch = index.sysread(1)
+	end
+	# join rather than to_s: on Ruby 1.9+ Array#to_s yields the inspect form
+	buffer.join
+end
+
+# Header comparison. Every statement below consumes bytes from both files in
+# lockstep, so the order of the reads is significant.
+
+# validate the magic number from both
+exitWithError("Magic number not valid for index 1") if index1.sysread(4) != "TIDX"
+exitWithError("Magic number not valid for index 2") if index2.sysread(4) != "TIDX"
+
+# validate the types
+compInts(index1,index2,4,"types")
+
+# validate the versions (v1/v2 are one-element arrays, reused further down)
+v1 = index1.sysread(4).unpack("i")
+v2 = index2.sysread(4).unpack("i")
+if (v1 != v2)
+	print "version, index1 (#{v1}) != index2 (#{v2})\n"
+else 
+	print "version, index1 (#{v1}) == index2 (#{v2})\n"
+end
+
+# validate the filenames
+fl1 = readString(index1)
+fl2 = readString(index2)
+if (fl1 != fl2)
+	print "filename, index1 (#{fl1}) != index2 (#{fl2})\n"
+else 
+	print "filename, index1 (#{fl1}) == index2 (#{fl2})\n"
+end
+# validate the sizes
+compInts(index1,index2,8,"sizes")
+
+
+# compare the next 8 bytes as raw bytes, labelled "T5"
+# NOTE(review): Index.rb reads this same field with readLong into @ts - presumably a timestamp; confirm
+compValues(index1,index2,8,"T5")
+
+# validate the MD5 - just a byte, we don't write the MD5 sums in yet
+# (read as a null-terminated string; fl1/fl2 are reused for it)
+fl1 = readString(index1)
+fl2 = readString(index2)
+if (fl1 != fl2)
+	print "md5, index1 (#{fl1}) != index2 (#{fl2})\n"
+else 
+	print "md5, index1 (#{fl1}) == index2 (#{fl2})\n"
+end
+
+# validate the flags (nothing is printed when they match)
+index1Flags = (index1.sysread(4)).unpack("L")
+index2Flags = (index2.sysread(4)).unpack("L")
+if (index1Flags != index2Flags)
+		print "Flags are different, index1 = #{index1Flags[0]}, index2 = #{index2Flags[0]}\n"
+end
+		
+# Read a sequence dictionary from the file: a 4-byte count followed by that
+# many (null-terminated name, 4-byte size) pairs. Prints a progress line and
+# the count, and returns the array of names.
+def readSeqDictionary(file)
+	puts "reading seq dict"
+	sequences = []
+	count = (file.sysread(4)).unpack("i")
+	puts count
+	count[0].times {
+		# Array has no add method; append with <<
+		sequences << readString(file).to_s
+		file.sysread(4) # drop the sizes for now
+	}
+	sequences # return sequences
+end
+
+# Each file carries its own flags, so consume the sequence dictionary of every
+# index that declares one; with an elsif the second file would be left
+# positioned in front of its dictionary whenever both files have one.
+if (index1Flags[0] == 32768)
+	puts readSeqDictionary(index1)
+end
+if (index2Flags[0] == 32768)
+	puts readSeqDictionary(index2)
+end
+
+# bump off the prop dictionary: a 4-byte count followed by that many
+# key/value string pairs (the same layout Index.rb reads)
+[[index1, v1], [index2, v2]].each { |index, version|
+	next unless version[0] == 3
+	propCount = (index.sysread(4)).unpack("i")[0]
+	propCount.times {
+		readString(index) # key
+		readString(index) # value
+	}
+}
+
+# Read one per-contig entry of a linear index from the file and print its
+# fields, including every bin start position together with the previous one.
+def readSeqEntry(i1)
+	puts "--------------------------------------------"
+	puts "Contig --> #{readString(i1).to_s}"
+	print "bin width         = #{(i1.sysread(4)).unpack("i")[0]}\n"
+	binCount = (i1.sysread(4)).unpack("i")[0]
+	print "number of bins    = #{binCount}\n"
+	print "longest feature   = #{(i1.sysread(4)).unpack("i")[0]}\n"
+	print "max bin size      = #{(i1.sysread(4)).unpack("i")[0]}\n"
+	print "total bin size    = #{(i1.sysread(4)).unpack("i")[0]}\n"
+	lastStartPos = -1
+	binCount.times { |index|
+		startPos = (i1.sysread(8)).unpack("q")[0]
+		#if (startPos < lastStartPos)
+			puts "bin at index #{index}, this start = #{startPos}, last start = #{lastStartPos}"
+		#end
+		lastStartPos = startPos
+	}
+	# one more 8-byte position follows the bins (LinearIndex.rb reads it as
+	# @finalPos); consume it so the next entry starts at the right offset
+	i1.sysread(8)
+end
+
+# Read the 4-byte sequence count from each file, print both counts, then dump
+# every sequence entry of the first file followed by those of the second.
+def compContigLinear(i1, i2)
+	firstCount = (i1.sysread(4)).unpack("L")
+	secondCount = (i2.sysread(4)).unpack("L")
+	print "seq count 1 = #{firstCount}, count 2 = #{secondCount}\n"
+
+	puts "\nentries for index 1"
+	firstCount[0].times { readSeqEntry(i1) }
+
+	puts "\nentries for index 2"
+	secondCount[0].times { readSeqEntry(i2) }
+end
+
+# dump the per-contig entries of both indexes; both files must already be
+# positioned right after their headers at this point
+compContigLinear(index1,index2)		
+	
+
+print "Done!\n"
+# close the files
+index1.close()
+index2.close()
--- a/ruby/util/BinaryFileReader.rb
+++ b/ruby/util/BinaryFileReader.rb
@ -0,0 +1,44 @@
+# A ruby class for reading binary files; really this just adds some convenience
+# methods like readInt(), readLong(), etc. on top of File#sysread.
+# All reads raise EOFError when the file ends before the requested bytes.
+class BinaryFileReader
+	# Open fileName for reading. Binary mode is requested explicitly so that no
+	# newline or encoding translation is applied to the data.
+	def initialize(fileName)
+		@file = File.open(fileName,"rb")
+	end
+
+	# read and return an int (4 byte, signed, machine based endian)
+	def readInt()
+		(@file.sysread(4)).unpack("i")[0]
+	end
+
+	# read and return an int (4 byte, unsigned, machine based endian)
+	def readUInt()
+		(@file.sysread(4)).unpack("L")[0]
+	end
+
+	# read and return a long (8 byte, signed, machine based endian)
+	def readLong()
+		(@file.sysread(8)).unpack("q")[0]
+	end
+
+	# read and return a set number of bytes as a string
+	def readBytes(count)
+		(@file.sysread(count)).to_s
+	end
+
+	# read and return a null terminated string (the terminator is consumed
+	# but not returned)
+	def readString()
+		buffer = []
+		ch = @file.sysread(1)
+		while (ch != "\0")
+			buffer.push(ch)
+			ch = @file.sysread(1)
+		end
+		# join rather than to_s: on Ruby 1.9+ Array#to_s yields the inspect form
+		buffer.join
+	end
+
+	# close the file
+	def close()
+		@file.close()
+	end
+end
+
--- a/ruby/validateIndex.rb
+++ b/ruby/validateIndex.rb
@ -0,0 +1,103 @@
+# this ruby file validates, diffs, or prints Tribble indexes (linear or interval)
+# set the include path to include the current directory
+$LOAD_PATH << File.dirname(__FILE__)
+
+# require a couple of files
+require "index/Index.rb"
+require "index/LinearIndex.rb"
+require "index/IntervalIndex.rb"
+require "optparse"
+require "yaml"
+
+# This hash will hold all of the options
+# parsed from the command-line by
+# OptionParser.
+options = {}
+
+optparse = OptionParser.new do|opts|
+  # Set a banner, displayed at the top
+  # of the help screen.
+  opts.banner = "Usage: ruby validateIndex.rb [options] file1 file2 ..."
+
+  # Define the options, and what they do
+  options[:index] = [] if options[:index] == nil
+  opts.on( '-i', '--index INDEX (REQUIRED)', 'Specify the index.  Multiple are allowed' ) do |file| options[:index].push(file) end
+
+  options[:verbose] = false
+  opts.on( '-v', '--verbose', 'Output more information' ) do options[:verbose] = true end
+
+  # -c stores its result under :check, so that key gets a default as well;
+  # :validate is kept because the rest of the script still reads it
+  options[:validate] = false
+  options[:check] = false
+  opts.on( '-c', '--check', 'Check (Validate) the index(es) passed in as parameters' ) do options[:check] = true end
+
+  options[:diff] = false
+  opts.on( '-d', '--diff', 'Diff two indexes' ) do options[:diff] = true end
+
+  options[:print] = false
+  opts.on( '-p', '--print', 'Print all of the information about the file' ) do options[:print] = true end
+
+  # This displays the help screen, all programs are
+  # assumed to have this option.
+  opts.on_tail( '-h', '--help', 'Display this screen' ) do
+    puts opts
+    exit
+  end
+end
+# parse the command line
+optparse.parse!
+
+# an index file is mandatory: show the usage and stop with a failing status
+# instead of carrying on with nothing to do
+if options[:index].size == 0
+	puts "you must at least specify an index file!"
+	puts optparse
+	exit(1)
+end
+
+# Load an index from the given file. The header is parsed once with the base
+# class just to learn the type; the file is then re-read with LinearIndex for
+# type 1 and with IntervalIndex for every other type.
+def loadIndex(file)
+	probe = Index.new(file)
+	probe.close()
+	if (probe.type != 1)
+		puts "Interval index..."
+		return IntervalIndex.new(file)
+	end
+	puts "Linear index..."
+	LinearIndex.new(file)
+end
+
+#################### Control Block ####################
+
+# load all of the indexes
+indexes = []
+options[:index].each {|indexFile|
+	indexes.push(loadIndex(indexFile))
+}
+
+# switch on the flags supplied
+if (options[:diff])
+	if (options[:index].size != 2)
+		print "Unable to diff indexes if you don't supply two and only two indexes\n";
+		exit(1)
+	else
+		indexes[0].diff(indexes[1])
+	end
+elsif (options[:print])
+	indexes.each {|index| puts YAML::dump( index ) }
+end
+
+# if they specified validate: -c/--check stores its flag under :check, and the
+# check runs in addition to the action chosen above. The indexes loaded
+# earlier are reused rather than opening every file a second time.
+if (options[:check] || options[:validate])
+	indexes.each {|index| index.validate() }
+end
+
+# release the file handles held by the loaded indexes
+indexes.each {|index| index.close() }
+
+exit