Bug fixes in the in-process functions (spoiled by Python: I was not closing my writers). SortByRef now works somewhat like the Perl script does, rather than doing a memory-expensive sort. Adding a QTools qscript, which is kind of clunky and will be used mostly for integration tests of these IPFs, pending some better way to construct argument collections and function accessors at compile time.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5182 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2011-02-03 17:32:46 +00:00
parent 9ddc95c833
commit 5398cf620a
5 changed files with 127 additions and 11 deletions

View File

@ -0,0 +1,68 @@
import org.broadinstitute.sting.queue.library.ipf.vcf.{VCFExtractIntervals, VCFExtractSamples, VCFSimpleMerge, VCFExtractSites}
import org.broadinstitute.sting.queue.library.ipf.SortByRef
import org.broadinstitute.sting.queue.QScript
import collection.JavaConversions._
// todo -- should the argument collection on which this runs be generated at compile-time into extensions??
// todo -- maybe a compile-time generated enum of available library functions? (ipf of course)
/**
 * QScript that schedules exactly one in-process library function, chosen by name via -T.
 *
 * Supported tool names: VCFExtractSites, VCFSimpleMerge, VCFExtractSamples,
 * VCFExtractIntervals and SortByRef. Each tool uses a different subset of the optional
 * arguments below; the subset it needs is validated right before the function is built,
 * so a missing flag is reported by name instead of surfacing as a NullPointerException.
 */
class QTools extends QScript {
  @Argument(doc="Tool to run",shortName="T", required=true) var qtool : String = _
  @Argument(doc="input VCF",shortName="ivcf",required=false) var inVCF : File = _
  @Argument(doc="input VCF files",shortName="vcfs",required=false) var inVCFs : String = _
  @Argument(doc="output file",shortName="out",required=true) var output : File = _
  @Argument(doc="reference file",shortName="ref",required=false) var ref : File = _
  @Argument(doc="The samples to extract",shortName="sm",required=false) var samples : String = _
  @Argument(doc="Keep filtered sites when merging or extracting?",shortName="kf",required=false) var keepFilters : Boolean = false
  // todo -- additional arguments or argument collection

  /**
   * Dispatches on the requested tool name and adds the matching function to the script.
   *
   * @throws IllegalArgumentException if the tool name is not one of the supported tools
   */
  def script: Unit = qtool match {
    case "VCFExtractSites" => runVCFExtractSites
    case "VCFSimpleMerge" => runVCFSimpleMerge
    case "VCFExtractSamples" => runVCFExtractSamples
    case "VCFExtractIntervals" => runVCFExtractIntervals
    case "SortByRef" => runSortByRef
    // previously an unrecognised name fell through every check and the script did nothing
    case unknown => throw new IllegalArgumentException("Unknown tool for -T: " + unknown)
  }

  /** Fails with a message naming the missing flag when an optional argument a tool needs was not given. */
  private def requireArg( value : AnyRef, flag : String ) {
    require(value != null, "-" + flag + " must be provided when running " + qtool)
  }

  /** The index file expected next to the reference: the reference's absolute path plus ".fai". */
  private def referenceIndex : File = {
    requireArg(ref, "ref")
    new File(ref.getAbsolutePath + ".fai")
  }

  /** Schedules VCFExtractSites from the single input VCF to the output file. */
  def runVCFExtractSites = {
    requireArg(inVCF, "ivcf")
    val ves : VCFExtractSites = new VCFExtractSites(inVCF,output)
    add(ves)
  }

  /** Schedules VCFSimpleMerge over the comma-separated list of input VCFs. */
  def runVCFSimpleMerge = {
    requireArg(inVCFs, "vcfs")
    val vsm : VCFSimpleMerge = new VCFSimpleMerge
    vsm.vcfs = inVCFs.split(",").toList.map(new File(_))
    vsm.outVCF = output
    vsm.fai = referenceIndex
    add(vsm)
  }

  /** Schedules VCFExtractSamples for the comma-separated sample names. */
  def runVCFExtractSamples = {
    requireArg(inVCF, "ivcf")
    requireArg(samples, "sm")
    val ves : VCFExtractSamples = new VCFExtractSamples(inVCF,output,samples.split(",").toList)
    add(ves)
  }

  /** Schedules VCFExtractIntervals, passing keepFilters through to the function. */
  def runVCFExtractIntervals = {
    requireArg(inVCF, "ivcf")
    val vei : VCFExtractIntervals = new VCFExtractIntervals(inVCF,output,keepFilters)
    add(vei)
  }

  /** Schedules SortByRef on the input file, using the reference's .fai index. */
  def runSortByRef = {
    requireArg(inVCF, "ivcf")
    val sbr : SortByRef = new SortByRef(inVCF,referenceIndex,output)
    add(sbr)
  }
}

View File

@ -14,24 +14,44 @@ class SortByRef( input: File, reference: File, output: File ) extends InProcessF
@Output(doc="The file to write the sorted file to") var outFile : File = output
@Argument(doc="The character or expression that separates entries") var separator : String = "\t"
@Argument(doc="The position of the contig in the file (1-based)") var pos: Int = 1
@Argument(doc="Comment characters (lines will be ignored)") var comment: List[String] = List("#")
@Argument(doc="Comment characters (lines will be brought to file head)") var comment: List[String] = List("#")
var contigMap: HashMap[String,Int] = new HashMap[String,Int];
val COMMENT_STRING = "@#!"
def contigVal( line : String ) : Int = {
if ( comment.contains(line.charAt(0)) ) {
return -1;
}
var contigMap: List[(String,PrintWriter,File)] = Nil;
/**
 * Builds the bookkeeping entry for one contig.
 *
 * @param line a tab-separated line whose first field is the contig name
 * @return the contig name, a writer onto a freshly created temporary file, and that file
 */
def entryToTriplet( line : String ) : (String,PrintWriter,File) = {
  // limit of 2: only the first field is needed, the remainder stays unsplit
  val ctig : String = line.split("\t",2)(0)
  val tmpf : File = File.createTempFile("sbr",".tmp")
  // the file is only an intermediate; without this every contig leaves a temp file behind
  tmpf.deleteOnExit()
  val pw : PrintWriter = new PrintWriter(new PrintStream(tmpf))
  (ctig,pw,tmpf)
}
def contigVal( line : String ) : PrintWriter = {
if ( contigMap.size < 1 ) { // no contigs
( new XReadLines(fai)).readLines.map( u => u.split("\t").head).zipWithIndex.foreach( u => contigMap += u )
contigMap :+= entryToTriplet(COMMENT_STRING+"\t.")
contigMap ++= ( new XReadLines(fai)).readLines.map( entryToTriplet(_)).toList
}
return contigMap( line.split(separator)(pos-1) )
if ( comment.contains(line.charAt(0).toString) ) {
return contigMap.find( u => u._1.equals(COMMENT_STRING)).head._2;
}
val matches = contigMap.find( u => u._1.equals(line.split(separator)(pos-1)))
if ( matches.isEmpty ) {
System.out.println("Empty match for "+line)
return contigMap(0)._2
} else { return matches.head._2 }
}
def run = {
var w : PrintWriter = new PrintWriter(new PrintStream(outFile))
( new XReadLines(inFile) ).readLines.sortBy(contigVal).foreach( u => w.println(u) )
System.out.println("Writing to temp files...")
( new XReadLines(inFile) ).readLines.foreach( u => contigVal(u).println(u) )
contigMap.foreach( u => u._2.close )
System.out.println("Concatenating...")
contigMap.map( u => new XReadLines(u._3) ).foreach( u => asScalaIterator(u).foreach(u => w.println(u)))
w.close()
}
}

View File

@ -19,6 +19,7 @@ class VCFExtractIntervals(inVCF: File, outList: File, useFilterSites: Boolean) e
/**
 * Opens the output writer on listOut, passes every line of vcfIn to vcf2int, and closes
 * the writer afterwards.
 */
def run = {
  out = new PrintWriter(new PrintStream(listOut))
  try {
    asScalaIterator(new XReadLines(vcfIn)).foreach(vcf2int)
  } finally {
    // close even when a line fails to process, so the file handle is never leaked
    out.close
  }
}
def vcf2int( vcfLine: String ) : Unit = {

View File

@ -6,7 +6,7 @@ import collection.JavaConversions._
import org.broadinstitute.sting.commandline._
import java.io.{PrintWriter, PrintStream, File}
class VCFExtractSamples(inVCF: File, outVCF: File, samples: List[String]) extends InProcessFunction {
class VCFExtractSamples(inVCF: File, outVCF: File, samples: List[String]) extends InProcessFunction {
def this(in: File, out: File, samples: File) = this(in,out, (new XReadLines(samples)).readLines.toList)
@Input(doc="VCF from which to extract samples") var inputVCF : File = inVCF
@ -19,6 +19,7 @@ class VCFExtractSamples(inVCF: File, outVCF: File, samples: List[String]) extend
/**
 * Opens the output writer on outputVCF, passes every line of inputVCF to subset, and
 * closes the writer afterwards.
 */
def run = {
  out = new PrintWriter(new PrintStream(outputVCF))
  try {
    asScalaIterator(new XReadLines(inputVCF)).foreach(subset)
  } finally {
    // close even when a line fails to process, so the file handle is never leaked
    out.close
  }
}
def subset( line : String ) {

View File

@ -17,6 +17,7 @@ class VCFExtractSites( vcf: File, output: File) extends InProcessFunction {
def lineMap( line: String ) : String = {
if ( line.startsWith("##") ) { return line }
val spline = line.split("\t",9)
if ( spline(0).startsWith("#")) { return spline.slice(0,8).reduceLeft( _+"\t"+_) }
if ( spline(6) == "PASS" || keepFilters ) {
if ( ! keepInfo ) {
@ -31,9 +32,34 @@ class VCFExtractSites( vcf: File, output: File) extends InProcessFunction {
return ""
}
/**
 * Tracing wrapper around lineMap: echoes the input line and the mapped result to standard
 * output, then hands back the mapped result unchanged.
 *
 * @param line the line to pass to lineMap
 * @return whatever lineMap returned for the line
 */
def lineMapDebug( line: String ) : String = {
  System.out.printf("Input: %s%n ",line)
  val mapped = lineMap(line)
  System.out.printf("Output: %s%n",mapped)
  mapped
}
/**
 * Tracing predicate: reports on standard output whether a line is kept.
 *
 * @param line the candidate line
 * @return true when the line is not the empty string, false otherwise
 */
def debugFilter ( line : String ) : Boolean = {
  System.out.printf("Filter In: %s%n",line)
  val kept = line != ""
  if ( kept ) {
    System.out.printf("Not filtered %n")
  } else {
    System.out.printf("Filtered%n")
  }
  kept
}
/**
 * Tracing writer: echoes the line to standard output, then writes it followed by a line
 * terminator to the supplied writer.
 *
 * @param line the text to emit
 * @param k the writer that receives the line
 */
def debugPrint(line: String, k : PrintWriter) : Unit = {
  val traceFormat = "Into print: %s%n"
  System.out.printf(traceFormat,line)
  k.println(line)
}
def run {
var w: PrintWriter = new PrintWriter( new PrintStream(outVCF) )
( new XReadLines(inVCF) ).readLines().map(lineMap).view.filter( u => u != "" ).foreach( u => w.println(u) )
asScalaIterator[String](new XReadLines(inVCF)).map(lineMap).filter( u => u != "" ).foreach( u => w.println(u) )
w.close
}
}