From fd1d817d45c9a30117cb1876c2320c2fe9f494ec Mon Sep 17 00:00:00 2001
From: chartl
Date: Thu, 16 Dec 2010 22:25:05 +0000
Subject: [PATCH] Cryptic implementation of base-string entropy. I suspect this scales ~linearly with length, so I may choose to normalize in the future.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4861 348d0f76-0448-11de-a6fe-93d51630548a
---
 scala/src/IntervalAnnotationWalker.scala | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/scala/src/IntervalAnnotationWalker.scala b/scala/src/IntervalAnnotationWalker.scala
index 0c7f1ea3c..50f7fafde 100755
--- a/scala/src/IntervalAnnotationWalker.scala
+++ b/scala/src/IntervalAnnotationWalker.scala
@@ -99,9 +99,21 @@ class IntervalInfoBuilder(loc : GenomeLoc, minProp : Double) {
     finalized = true
     def isGC(b : Byte) : Int = if ( BaseUtils.gIndex == b || BaseUtils.cIndex == b ) { 1 } else { 0 }
     gcContent = baseContent.foldLeft[Int](0)( (a,b) => a + isGC(b)).asInstanceOf[Double]/location.size()
-    entropy = 0.0 // todo -- implement me
+    entropy = calcEntropy(baseContent.map(b => ListBuffer(b))) + calcEntropy(baseContent.reverse.map(b => ListBuffer(b)))
     val meta : String = metaData.reduceLeft(_ + "\t" + _)
     return "%s\t%d\t%d\t%.2f\t%.2f\t%s".format(location.getContig,location.getStart,location.getStop,gcContent,entropy,meta)
   }
 
+  def calcEntropy(byteList : ListBuffer[ListBuffer[Byte]]) : Double = {
+    if(byteList.size == 1) return 0
+    Math.log(1+byteList.tail.size-byteList.tail.dropWhile( u => u.equals(byteList(1))).size) +
+    calcEntropy(byteList.tail.foldLeft(ListBuffer(byteList(0)))( (a,b) => {
+      if ( b.equals(byteList(1)) ) {
+        a.dropRight(1) + (a.last ++ b)
+      } else {
+        a + b
+      }
+    }))
+  }
+
 }
\ No newline at end of file