From 9515f942428516ba508368178edf1408298a1ef4 Mon Sep 17 00:00:00 2001 From: chartl Date: Thu, 3 Feb 2011 05:01:21 +0000 Subject: [PATCH] Commiting a simple merge IPF for use with qscripts (currently use a long grep, awk, pipe command, which can be unsafe and is hard to extend). Tests for all these functions coming soon. Also, IntelliJ + intermittent VPN connection = botched repository. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5179 348d0f76-0448-11de-a6fe-93d51630548a --- .../library/ipf/vcf/VCFSimpleMerge.scala | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100755 scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala diff --git a/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala new file mode 100755 index 000000000..c82f3ad96 --- /dev/null +++ b/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFSimpleMerge.scala @@ -0,0 +1,78 @@ +package org.broadinstitute.sting.queue.library.ipf.vcf + +import collection.JavaConversions._ +import org.broadinstitute.sting.queue.function.InProcessFunction +import org.broadinstitute.sting.commandline._ +import org.broadinstitute.sting.utils.text.XReadLines +import java.io.{PrintStream, PrintWriter, File} +import net.sf.samtools.{SAMSequenceRecord, SAMSequenceDictionary} +import org.broadinstitute.sting.utils.{GenomeLocParser, GenomeLoc} + +class VCFSimpleMerge extends InProcessFunction { + @Input(doc="VCFs to be merged") var vcfs: List[File] = Nil + @Input(doc="The reference fasta index") var fasta: File = _ + @Output(doc="The final VCF to write to") var outVCF : File = _ + + class PeekableXRL(f : File ) { + var xrl : XReadLines = new XReadLines(f) + var cur : String = if ( xrl.hasNext ) xrl.next else null + + def hasNext : Boolean = xrl.hasNext || cur != null + def next : String = { + var toRet : String = cur + if ( xrl.hasNext ) { + cur = xrl.next + } else { + cur = null + } + + return toRet + } + + def peek : String = cur + + } + + def readHeader( xrl : PeekableXRL ) : List[String] = { + var toRet : List[String] = Nil + while ( xrl.hasNext && xrl.peek.startsWith("#") ) { + toRet :+= xrl.next + } + + return toRet + } + + def genomeLoc(xrl : PeekableXRL, p : GenomeLocParser ) : GenomeLoc = { + var slp = xrl.peek.split("\t",3) + return p.parseGenomeLoc(slp(0),Integer.parseInt(slp(1)),Integer.parseInt(slp(1))) + } + + def run = { + var ssd : SAMSequenceDictionary = new SAMSequenceDictionary + for ( line <- (new XReadLines(fasta)).readLines ) { + val spl = line.split("\\s+") + val ctig = spl(0) + val pos = Integer.parseInt(spl(1)) + ssd.addSequence(new SAMSequenceRecord(ctig,pos)) + } + + var xrls : List[PeekableXRL] = vcfs.map( new PeekableXRL(_) ) + + var w : PrintWriter = new PrintWriter(new PrintStream(outVCF)) + + readHeader(xrls(0)).foreach(u => w.println(u) ) + + xrls.foreach(readHeader(_)) + + val glp : GenomeLocParser = new GenomeLocParser(ssd) + + while ( ! xrls.forall( u => ! u.hasNext ) ) { + val first = xrls.filter( u => u.hasNext).reduceLeft( (a,b) => if ( genomeLoc(a,glp).isBefore(genomeLoc(b,glp))) a else b ) + w.println(first.next) + } + + w.close + + } + +} \ No newline at end of file