From 5398cf620aecd2715ff1423e761e75477f882afe Mon Sep 17 00:00:00 2001 From: chartl Date: Thu, 3 Feb 2011 17:32:46 +0000 Subject: [PATCH] Bug fixes in the in process function (spoiled by python: was not closing my writers). SortByRef now works somewhat like the perl script does, rather than doing a memory-expensive sort. Adding a QTools qscript which is kinda clunky, and will be used mostly for integration tests of these IPFs, pending some better way to construct argument collections and function accessors at compile-time. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5182 348d0f76-0448-11de-a6fe-93d51630548a --- scala/qscript/oneoffs/QTools.q | 68 +++++++++++++++++++ .../sting/queue/library/ipf/SortByRef.scala | 38 ++++++++--- .../library/ipf/vcf/VCFExtractIntervals.scala | 1 + .../library/ipf/vcf/VCFExtractSamples.scala | 3 +- .../library/ipf/vcf/VCFExtractSites.scala | 28 +++++++- 5 files changed, 127 insertions(+), 11 deletions(-) create mode 100755 scala/qscript/oneoffs/QTools.q diff --git a/scala/qscript/oneoffs/QTools.q b/scala/qscript/oneoffs/QTools.q new file mode 100755 index 000000000..84b4ef0fc --- /dev/null +++ b/scala/qscript/oneoffs/QTools.q @@ -0,0 +1,68 @@ +import org.broadinstitute.sting.queue.library.ipf.vcf.{VCFExtractIntervals, VCFExtractSamples, VCFSimpleMerge, VCFExtractSites} +import org.broadinstitute.sting.queue.library.ipf.SortByRef +import org.broadinstitute.sting.queue.QScript +import collection.JavaConversions._ + +// todo -- should the argument collection on which this runs be generated at compile-time into extensions?? +// todo -- maybe a compile-time generated enum of available library functions? (ipf of course) +class QTools extends QScript { + @Argument(doc="Tool to run",shortName="T", required=true) var qtool : String = _ + @Argument(doc="input VCF",shortName="ivcf",required=false) var inVCF : File = _ + @Argument(doc="input VCF files",shortName="vcfs",required=false) var inVCFs : String = _ + @Argument(doc="output file",shortName="out",required=true) var output : File = _ + @Argument(doc="reference file",shortName="ref",required=false) var ref : File = _ + @Argument(doc="The samples to extract",shortName="sm",required=false) var samples : String = _ + @Argument(doc="Keep filtered sites when merging or extracting?",shortName="kf",required=false) var keepFilters : Boolean = false + // todo -- additional arguments or argument collection + + def script = { + if ( qtool.equals("VCFExtractSites") ) { + runVCFExtractSites + } + + if ( qtool.equals("VCFSimpleMerge") ) { + runVCFSimpleMerge + } + + if ( qtool.equals("VCFExtractSamples") ) { + runVCFExtractSamples + } + + if ( qtool.equals("VCFExtractIntervals") ) { + runVCFExtractIntervals + } + + if ( qtool.equals("SortByRef") ) { + runSortByRef + } + } + + def runVCFExtractSites = { + var ves : VCFExtractSites = new VCFExtractSites(inVCF,output) + add(ves) + } + + def runVCFSimpleMerge = { + var vsm : VCFSimpleMerge = new VCFSimpleMerge + vsm.vcfs = inVCFs.split(",").toList.map(new File(_)) + vsm.outVCF = output + vsm.fai = new File(ref.getAbsolutePath+".fai") + + add(vsm) + } + + def runVCFExtractSamples = { + var ves : VCFExtractSamples = new VCFExtractSamples(inVCF,output,samples.split(",").toList) + add(ves) + } + + def runVCFExtractIntervals = { + var vei : VCFExtractIntervals = new VCFExtractIntervals(inVCF,output,keepFilters) + add(vei) + } + + def runSortByRef = { + var sbr : SortByRef = new SortByRef(inVCF,new File(ref.getAbsolutePath+".fai"),output) + add(sbr) + } +} \ No newline at end of file diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/SortByRef.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/SortByRef.scala index 2ffedbc31..c0d6d75c3 100755 --- a/scala/src/org/broadinstitute/sting/queue/library/ipf/SortByRef.scala +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/SortByRef.scala @@ -14,24 +14,44 @@ class SortByRef( input: File, reference: File, output: File ) extends InProcessF @Output(doc="The file to write the sorted file to") var outFile : File = output @Argument(doc="The character or expression that separates entries") var separator : String = "\t" @Argument(doc="The position of the contig in the file (1-based)") var pos: Int = 1 - @Argument(doc="Comment characters (lines will be ignored)") var comment: List[String] = List("#") + @Argument(doc="Comment characters (lines will be brought to file head)") var comment: List[String] = List("#") - var contigMap: HashMap[String,Int] = new HashMap[String,Int]; + val COMMENT_STRING = "@#!" - def contigVal( line : String ) : Int = { - if ( comment.contains(line.charAt(0)) ) { - return -1; - } + var contigMap: List[(String,PrintWriter,File)] = Nil; + + def entryToTriplet( line : String ) : (String,PrintWriter,File) = { + val ctig : String = line.split("\t",2)(0) + val tmpf : File = File.createTempFile("sbr",".tmp") + val pw : PrintWriter = new PrintWriter(new PrintStream(tmpf)) + return (ctig,pw,tmpf) + } + + def contigVal( line : String ) : PrintWriter = { if ( contigMap.size < 1 ) { // no contigs - ( new XReadLines(fai)).readLines.map( u => u.split("\t").head).zipWithIndex.foreach( u => contigMap += u ) + contigMap :+= entryToTriplet(COMMENT_STRING+"\t.") + contigMap ++= ( new XReadLines(fai)).readLines.map( entryToTriplet(_)).toList } - return contigMap( line.split(separator)(pos-1) ) + if ( comment.contains(line.charAt(0).toString) ) { + return contigMap.find( u => u._1.equals(COMMENT_STRING)).head._2; + } + + val matches = contigMap.find( u => u._1.equals(line.split(separator)(pos-1))) + if ( matches.isEmpty ) { + System.out.println("Empty match for "+line) + return contigMap(0)._2 + } else { return matches.head._2 } } def run = { var w : PrintWriter = new PrintWriter(new PrintStream(outFile)) - ( new XReadLines(inFile) ).readLines.sortBy(contigVal).foreach( u => w.println(u) ) + System.out.println("Writing to temp files...") + ( new XReadLines(inFile) ).readLines.foreach( u => contigVal(u).println(u) ) + contigMap.foreach( u => u._2.close ) + System.out.println("Concatenating...") + contigMap.map( u => new XReadLines(u._3) ).foreach( u => asScalaIterator(u).foreach(u => w.println(u))) + w.close() } } \ No newline at end of file diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala index 2dd057cb6..49d070e75 100755 --- a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala @@ -19,6 +19,7 @@ class VCFExtractIntervals(inVCF: File, outList: File, useFilterSites: Boolean) e def run = { out = new PrintWriter(new PrintStream(listOut)) asScalaIterator(new XReadLines(vcfIn)).foreach(vcf2int) + out.close } def vcf2int( vcfLine: String ) : Unit = { diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala index b9f75b0b6..54e541142 100755 --- a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala @@ -6,7 +6,7 @@ import collection.JavaConversions._ import org.broadinstitute.sting.commandline._ import java.io.{PrintWriter, PrintStream, File} -class VCFExtractSamples(inVCF: File, outVCF: File, samples: List[String]) extends InProcessFunction { +class VCFExtractSamples(inVCF: File, outVCF: File, samples: List[String]) extends InProcessFunction { def this(in: File, out: File, samples: File) = this(in,out, (new XReadLines(samples)).readLines.toList) @Input(doc="VCF from which to extract samples") var inputVCF : File = inVCF @@ -19,6 +19,7 @@ class VCFExtractSamples(inVCF: File, outVCF: File, samples: List[String]) extend def run = { out = new PrintWriter(new PrintStream(outputVCF)) asScalaIterator(new XReadLines(inputVCF)).foreach(subset) + out.close } def subset( line : String ) { diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala index 61e7076f9..f4d64d12d 100755 --- a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSites.scala @@ -17,6 +17,7 @@ class VCFExtractSites( vcf: File, output: File) extends InProcessFunction { def lineMap( line: String ) : String = { if ( line.startsWith("##") ) { return line } val spline = line.split("\t",9) + if ( spline(0).startsWith("#")) { return spline.slice(0,8).reduceLeft( _+"\t"+_) } if ( spline(6) == "PASS" || keepFilters ) { if ( ! keepInfo ) { @@ -31,9 +32,34 @@ class VCFExtractSites( vcf: File, output: File) extends InProcessFunction { return "" } + def lineMapDebug( line: String ) : String = { + System.out.printf("Input: %s%n ",line) + val o = lineMap(line) + System.out.printf("Output: %s%n",o) + + return o + } + + def debugFilter ( line : String ) : Boolean = { + System.out.printf("Filter In: %s%n",line) + if ( line != "" ) { + System.out.printf("Not filtered %n") + return true + } else { + System.out.printf("Filtered%n") + return false + } + } + + def debugPrint(line: String, k : PrintWriter) : Unit = { + System.out.printf("Into print: %s%n",line) + k.println(line) + } + def run { var w: PrintWriter = new PrintWriter( new PrintStream(outVCF) ) - ( new XReadLines(inVCF) ).readLines().map(lineMap).view.filter( u => u != "" ).foreach( u => w.println(u) ) + asScalaIterator[String](new XReadLines(inVCF)).map(lineMap).filter( u => u != "" ).foreach( u => w.println(u) ) + w.close } } \ No newline at end of file