Small script to chunk up a VCF into equal-sized chunks
This commit is contained in:
parent
0111505ea9
commit
27ea6426a4
|
|
@ -0,0 +1,88 @@
|
||||||
|
package org.broadinstitute.sting.queue.qscripts.lib
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.queue.QScript
|
||||||
|
import org.broadinstitute.sting.queue.library.ipf.vcf.VCFExtractIntervals
|
||||||
|
import scala.collection.JavaConversions._
|
||||||
|
import org.broadinstitute.sting.utils.text.XReadLines
|
||||||
|
import java.io.PrintStream
|
||||||
|
import org.broadinstitute.sting.queue.extensions.gatk.SelectVariants
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: chartl
|
||||||
|
* Date: 2/2/12
|
||||||
|
* Time: 12:13 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
|
||||||
|
class ChunkVCF extends QScript {
|
||||||
|
|
||||||
|
@Input(shortName="V",fullName="VCF",doc="The VCF you want to chunk",required=true)
|
||||||
|
var inVCF : File = _
|
||||||
|
|
||||||
|
@Input(shortName="N",fullName="numEntriesInChunk",doc="The number of variants per chunk",required=true)
|
||||||
|
var numEntries : Int = _
|
||||||
|
|
||||||
|
@Input(shortName="I",fullName="Intervals",doc="The SNP interval list to chunk. If not provided, one will be created for you to provide in a second run.")
|
||||||
|
var intervals : File = _
|
||||||
|
|
||||||
|
@Input(fullName="preserveChromosomes",doc="Restrict chunks to one chromosome (smaller chunk at end of chromosome)",required=false)
|
||||||
|
var preserve : Boolean = false
|
||||||
|
|
||||||
|
@Input(fullName="reference",doc="The reference file",required=false)
|
||||||
|
var ref : File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
|
||||||
|
|
||||||
|
@Input(fullName="samples",doc="A file of sample IDs to condense VCF file to",required=false)
|
||||||
|
var extractSamples : File = _
|
||||||
|
|
||||||
|
val tmpdir : File = System.getProperty("java.io.tmpdir")
|
||||||
|
|
||||||
|
def script = {
|
||||||
|
if ( intervals == null ) {
|
||||||
|
// create an interval list from the VCF
|
||||||
|
val ivals : File = swapExt(variants,".vcf",".intervals.list")
|
||||||
|
val extract : VCFExtractIntervals = new VCFExtractIntervals(variants,ivals,false)
|
||||||
|
add(extract)
|
||||||
|
} else {
|
||||||
|
var chunkNum = 1
|
||||||
|
var numLinesInChunk = 0
|
||||||
|
var chromosome : String = asScalaIterator(new XReadLines(intervals)).next().split(":")(0)
|
||||||
|
var chunkFile : File = new File(tmpdir,"ChunkVCF.chunk%d.intervals.list".format(chunkNum))
|
||||||
|
var chunkWriter = new PrintStream(chunkFile)
|
||||||
|
asScalaIterator(new XReadLines(intervals)).foreach( int => {
|
||||||
|
// check new chromosome or full chunk
|
||||||
|
if ( ( preserve && ! int.split(":")(0).equals(chromosome) ) || numLinesInChunk > numEntries ) {
|
||||||
|
chunkWriter.close()
|
||||||
|
val chunkSelect : SelectVariants = new SelectVariants
|
||||||
|
chunkSelect.reference_sequence = ref
|
||||||
|
chunkSelect.memoryLimit = 2
|
||||||
|
chunkSelect.intervals :+= chunkFile
|
||||||
|
if ( extractSamples != null )
|
||||||
|
chunkSelect.sample_file = extractSamples
|
||||||
|
chunkSelect.out = swapExt(inVCF,".vcf",".chunk%d.vcf".format(chunkNum))
|
||||||
|
add(chunkSelect)
|
||||||
|
chunkNum += 1
|
||||||
|
numLinesInChunk = 0
|
||||||
|
chromosome = int.split(":")(0)
|
||||||
|
chunkFile = new File(tmpdir,"ChunkVCF.chunk%d.intervals.list".format(chunkNum))
|
||||||
|
chunkWriter = new PrintStream(chunkFile)
|
||||||
|
}
|
||||||
|
chunkWriter.printf("%s%n",int)
|
||||||
|
numLinesInChunk += 1
|
||||||
|
})
|
||||||
|
// last chunk
|
||||||
|
if ( numLinesInChunk > 0 ) {
|
||||||
|
// some work to do
|
||||||
|
val chunkSelect : SelectVariants = new SelectVariants
|
||||||
|
chunkSelect.reference_sequence = ref
|
||||||
|
chunkSelect.memoryLimit = 2
|
||||||
|
chunkSelect.intervals :+= chunkFile
|
||||||
|
chunkWriter.close()
|
||||||
|
if ( extractSamples != null )
|
||||||
|
chunkSelect.sample_file = extractSamples
|
||||||
|
chunkSelect.out = swapExt(inVCF,".vcf",".chunk%d.vcf".format(chunkNum))
|
||||||
|
add(chunkSelect)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue