Two intermediate commits, to work over the weekend.
ReplicationValidationWalker: Just the skeleton of what will be the implementation of the replication/validation model. dataProcessingV2: Committing an UNTESTED implementation of BWA alignment. I am running tests on it over the weekend. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5900 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
69d9b5989f
commit
5974675b43
|
|
@ -0,0 +1,57 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.replication_validation;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Implementation of the replication and validation framework with reference based error model
|
||||
* for pooled sequencing.
|
||||
*
|
||||
* The input should be a BAM file with pooled sequencing data where each pool is represented by
|
||||
* samples with the same barcode.
|
||||
*
|
||||
* A reference sample name must be provided and it must be barcoded uniquely.
|
||||
*/
|
||||
public class ReplicationValidationWalker extends LocusWalker<Integer, Long> implements TreeReducible<Long> {
|
||||
@Argument(shortName="refSample", fullName="reference_sample_name", doc="reference sample name", required=true)
|
||||
String referenceSampleName = "NA12878";
|
||||
|
||||
@Output(doc="Write output to this file instead of STDOUT")
|
||||
PrintStream out;
|
||||
|
||||
public void initialize() {
|
||||
return;
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
// todo -- Locate the reference samples for each lane
|
||||
// todo -- Locate each pool and assign a reference sample to it
|
||||
return 1;
|
||||
}
|
||||
|
||||
public Long reduceInit() { return 0l; }
|
||||
|
||||
public Long reduce(Integer value, Long sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reduces two subtrees together. In this case, the implementation of the tree reduce
|
||||
* is exactly the same as the implementation of the single reduce.
|
||||
*/
|
||||
public Long treeReduce(Long lhs, Long rhs) {
|
||||
return lhs + rhs;
|
||||
}
|
||||
|
||||
public void onTraversalDone( Long c ) {
|
||||
out.println(c);
|
||||
}
|
||||
}
|
||||
|
|
@ -10,6 +10,7 @@ import net.sf.samtools.{SAMFileReader,SAMReadGroupRecord}
|
|||
import scala.io.Source._
|
||||
import collection.JavaConversions._
|
||||
import org.broadinstitute.sting.queue.function.scattergather.ScatterFunction
|
||||
import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter
|
||||
|
||||
|
||||
class dataProcessingV2 extends QScript {
|
||||
|
|
@ -60,6 +61,9 @@ class dataProcessingV2 extends QScript {
|
|||
@Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false)
|
||||
var bwaPath: File = _
|
||||
|
||||
@Input(doc="The path to the binary of samtools (required if using BWA)", fullName="path_to_sam", shortName="sam", required=false)
|
||||
var samPath: File = _
|
||||
|
||||
@Input(doc="dbsnp ROD to use (must be in VCF format)", fullName="dbsnp", shortName="D", required=false)
|
||||
var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
|
||||
|
||||
|
|
@ -111,8 +115,8 @@ class dataProcessingV2 extends QScript {
|
|||
var sample: String = ""
|
||||
for (r <- readGroups) {
|
||||
if (sample.isEmpty)
|
||||
sample = r.getSample()
|
||||
else if (sample != r.getSample())
|
||||
sample = r.getSample
|
||||
else if (sample != r.getSample)
|
||||
return true;
|
||||
}
|
||||
return false
|
||||
|
|
@ -126,8 +130,8 @@ class dataProcessingV2 extends QScript {
|
|||
val sampleTable = scala.collection.mutable.Map.empty[String, List[File]]
|
||||
for (bam <- bamFiles) {
|
||||
val samReader = new SAMFileReader(bam)
|
||||
val header = samReader.getFileHeader()
|
||||
val readGroups = header.getReadGroups()
|
||||
val header = samReader.getFileHeader
|
||||
val readGroups = header.getReadGroups
|
||||
|
||||
// only allow one sample per file. Bam files with multiple samples would require pre-processing of the file
|
||||
// with PrintReads to separate the samples. Tell user to do it himself!
|
||||
|
|
@ -135,7 +139,7 @@ class dataProcessingV2 extends QScript {
|
|||
|
||||
// Fill out the sample table with the readgroups in this file
|
||||
for (rg <- readGroups) {
|
||||
val sample = rg.getSample()
|
||||
val sample = rg.getSample
|
||||
if (!sampleTable.contains(sample))
|
||||
sampleTable(sample) = List(bam)
|
||||
else if ( !sampleTable(sample).contains(bam))
|
||||
|
|
@ -160,10 +164,46 @@ class dataProcessingV2 extends QScript {
|
|||
}
|
||||
|
||||
|
||||
// Takes a list of processed BAM files, revert them to unprocessed and realigns each lane, producing a list of
|
||||
// per-lane aligned bam files, ready to be processed.
|
||||
def performAlignment(bamList: File): List[File] = {
|
||||
return List()
|
||||
def buildReadGroupString(samReader: SAMFileReader): String = {
|
||||
val readGroups = samReader.getFileHeader.getReadGroups
|
||||
var s = ""
|
||||
for (rg <- readGroups) {
|
||||
if (s != "") {
|
||||
s = s + "\n"
|
||||
}
|
||||
s = s + "@RG\t" +
|
||||
SAMReadGroupRecord.READ_GROUP_ID_TAG + ":" + rg.getReadGroupId + "\t" +
|
||||
SAMReadGroupRecord.PLATFORM_TAG + ":" + rg.getPlatformUnit + "\t" +
|
||||
SAMReadGroupRecord.LIBRARY_TAG + ":" + rg.getLibrary + "\t" +
|
||||
SAMReadGroupRecord.READ_GROUP_SAMPLE_TAG + ":" + rg.getSample + "\t" +
|
||||
SAMReadGroupRecord.SEQUENCING_CENTER_TAG + ":" + rg.getSequencingCenter + "\t"
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
// Takes a list of processed BAM files and realign them using the BWA option requested (bwase or bwape).
|
||||
// Returns a list of realigned BAM files.
|
||||
def performAlignment(bamMap: Map[String, File]) {
|
||||
for (key <- bamMap.keysIterator) {
|
||||
val bam: File = bamMap(key)
|
||||
val saiFile1 = swapExt(bam, ".bam", "1.sai")
|
||||
val saiFile2 = swapExt(bam, ".bam", "2.sai")
|
||||
val realignedSamFile = swapExt(bam, ".bam", ".realigned.sam")
|
||||
val realignedBamFile = swapExt(bam, ".bam", ".realigned.bam")
|
||||
val readGroupString: String = buildReadGroupString(new SAMFileReader())
|
||||
if (useBWAse) {
|
||||
add(bwa_aln_se(bam, saiFile1),
|
||||
bwa_sam_se(bam, saiFile1, realignedSamFile, readGroupString),
|
||||
to_bam(realignedSamFile, realignedBamFile))
|
||||
}
|
||||
else if (useBWApe) {
|
||||
add(bwa_aln_pe(bam, saiFile1),
|
||||
bwa_aln_pe(bam, saiFile2),
|
||||
bwa_sam_pe(bam, saiFile1, saiFile2, realignedSamFile, readGroupString),
|
||||
to_bam(realignedSamFile, realignedBamFile))
|
||||
}
|
||||
bamMap(key) = realignedBamFile
|
||||
}
|
||||
}
|
||||
|
||||
def createListFromFile(in: File):List[File] = {
|
||||
|
|
@ -184,14 +224,18 @@ class dataProcessingV2 extends QScript {
|
|||
|
||||
def script = {
|
||||
|
||||
//todo -- (option - BWA) run BWA on each bam file (per lane bam file) before performing per sample processing
|
||||
val perLaneAlignedBamFiles: List[File] = if (useBWApe || useBWAse) { performAlignment(input) } else { createListFromFile(input) }
|
||||
// keep a record of the number of contigs in the first bam file in the list
|
||||
val bams = createListFromFile(input)
|
||||
nContigs = getNumberOfContigs(bams(0))
|
||||
|
||||
// Generate a BAM file per sample joining all per lane files if necessary
|
||||
val sampleBamFiles = createSampleFiles(perLaneAlignedBamFiles)
|
||||
val sampleBamFiles: Map[String, File] = createSampleFiles(bams)
|
||||
|
||||
//todo -- (option - BWA) run BWA on each bam file (per lane bam file) before performing per sample processing
|
||||
if (useBWApe || useBWAse) {
|
||||
performAlignment(sampleBamFiles)
|
||||
}
|
||||
|
||||
// keep a record of the number of contigs in the first bam file in the list
|
||||
nContigs = getNumberOfContigs(perLaneAlignedBamFiles(0))
|
||||
|
||||
println("nContigs: " + nContigs)
|
||||
|
||||
|
|
@ -408,7 +452,7 @@ class dataProcessingV2 extends QScript {
|
|||
this.jobName = queueLogDir + outSai + ".bwa_aln_se"
|
||||
}
|
||||
|
||||
case class bwa_aln_pe1 (inBam: File, outSai1: File) extends CommandLineFunction {
|
||||
case class bwa_aln_pe (inBam: File, outSai1: File) extends CommandLineFunction {
|
||||
@Input(doc="bam file to be aligned") var bam = inBam
|
||||
@Output(doc="output sai file for 1st mating pair") var sai = outSai1
|
||||
def commandLine = bwaPath + " aln -q 5 " + reference + " -b1 " + bam + " > " + sai
|
||||
|
|
@ -417,15 +461,6 @@ class dataProcessingV2 extends QScript {
|
|||
this.jobName = queueLogDir + outSai1 + ".bwa_aln_pe1"
|
||||
}
|
||||
|
||||
case class bwa_aln_pe2 (inBam: File, outSai2: File) extends CommandLineFunction {
|
||||
@Input(doc="bam file to be aligned") var bam = inBam
|
||||
@Output(doc="output sai file for 2nd mating pair") var sai = outSai2
|
||||
def commandLine = bwaPath + " aln -q 5 " + reference + " -b2 " + bam + " > " + sai
|
||||
this.isIntermediate = true
|
||||
this.analysisName = queueLogDir + outSai2 + ".bwa_aln_pe2"
|
||||
this.jobName = queueLogDir + outSai2 + ".bwa_aln_pe2"
|
||||
}
|
||||
|
||||
case class bwa_sam_se (inBam: File, inSai: File, outBam: File, readGroup: String) extends CommandLineFunction {
|
||||
@Input(doc="bam file to be aligned") var bam = inBam
|
||||
@Input(doc="bwa alignment index file") var sai = inSai
|
||||
|
|
@ -447,6 +482,15 @@ class dataProcessingV2 extends QScript {
|
|||
this.jobName = queueLogDir + outBam + ".bwa_sam_pe"
|
||||
}
|
||||
|
||||
case class to_bam (inSam: File, outBam: File) extends CommandLineFunction {
|
||||
@Input(doc="sam file to become bam") var sam = inSam
|
||||
@Output(doc="bam file") var bam = outBam
|
||||
def commandLine = samPath + " view -b " + sam + " > " + bam
|
||||
this.isIntermediate = true
|
||||
this.analysisName = queueLogDir + outBam + ".to_bam"
|
||||
this.jobName = queueLogDir + outBam + ".to_bam"
|
||||
}
|
||||
|
||||
case class writeList(inBams: List[File], outBamList: File) extends ListWriterFunction {
|
||||
this.inputFiles = inBams
|
||||
this.listFile = outBamList
|
||||
|
|
|
|||
Loading…
Reference in New Issue