Now uses tagging of -I arguments. Multiple -I options (merging) is now allowed. In somatic mode 'tumor' and 'normal' tags are required for each input bam, the order does not matter anymore (since we use tags!)
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4286 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e5f81d25d4
commit
d7b5baf8e5
|
|
@ -31,10 +31,7 @@ import org.broad.tribble.util.variantcontext.Allele;
|
||||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
import org.broad.tribble.util.variantcontext.Genotype;
|
import org.broad.tribble.util.variantcontext.Genotype;
|
||||||
import org.broad.tribble.vcf.*;
|
import org.broad.tribble.vcf.*;
|
||||||
import org.broadinstitute.sting.gatk.filters.Platform454Filter;
|
import org.broadinstitute.sting.gatk.filters.*;
|
||||||
import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter;
|
|
||||||
import org.broadinstitute.sting.gatk.filters.PlatformUnitFilterHelper;
|
|
||||||
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.*;
|
import org.broadinstitute.sting.gatk.refdata.*;
|
||||||
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec;
|
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec;
|
||||||
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
|
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
|
||||||
|
|
@ -46,8 +43,10 @@ import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
||||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSource;
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSource;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID;
|
||||||
import org.broadinstitute.sting.utils.*;
|
import org.broadinstitute.sting.utils.*;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||||
import org.broadinstitute.sting.utils.collections.CircularArray;
|
import org.broadinstitute.sting.utils.collections.CircularArray;
|
||||||
import org.broadinstitute.sting.utils.collections.PrimitivePair;
|
import org.broadinstitute.sting.utils.collections.PrimitivePair;
|
||||||
|
|
@ -148,8 +147,8 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
private LocationAwareSeekableRODIterator refseqIterator=null;
|
private LocationAwareSeekableRODIterator refseqIterator=null;
|
||||||
|
|
||||||
private Set<String> normalReadGroups; // we are going to remember which read groups are normals and which are tumors in order to be able
|
// private Set<String> normalReadGroups; // we are going to remember which read groups are normals and which are tumors in order to be able
|
||||||
private Set<String> tumorReadGroups ; // to properly assign the reads coming from a merged stream
|
// private Set<String> tumorReadGroups ; // to properly assign the reads coming from a merged stream
|
||||||
private Set<String> normalSamples; // we are going to remember which samples are normal and which are tumor:
|
private Set<String> normalSamples; // we are going to remember which samples are normal and which are tumor:
|
||||||
private Set<String> tumorSamples ; // these are used only to generate genotypes for vcf output
|
private Set<String> tumorSamples ; // these are used only to generate genotypes for vcf output
|
||||||
|
|
||||||
|
|
@ -206,7 +205,9 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
|
||||||
normal_context = new WindowContext(0,WINDOW_SIZE);
|
normal_context = new WindowContext(0,WINDOW_SIZE);
|
||||||
|
normalSamples = new HashSet<String>();
|
||||||
|
|
||||||
if ( bedOutput != null && output_file != null ) {
|
if ( bedOutput != null && output_file != null ) {
|
||||||
throw new UserException.DeprecatedArgument("-O", "-O option is deprecated and -bed option replaces it; you can not use both at the same time");
|
throw new UserException.DeprecatedArgument("-O", "-O option is deprecated and -bed option replaces it; you can not use both at the same time");
|
||||||
|
|
@ -229,33 +230,77 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
int nSams = getToolkit().getArguments().samFiles.size();
|
int nSams = getToolkit().getArguments().samFiles.size();
|
||||||
|
|
||||||
|
if ( call_somatic ) {
|
||||||
|
if ( nSams < 2 ) throw new UserException.BadInput("At least two bam files (normal and tumor) must be specified in somatic mode");
|
||||||
|
tumor_context = new WindowContext(0,WINDOW_SIZE);
|
||||||
|
tumorSamples = new HashSet<String>();
|
||||||
|
}
|
||||||
|
|
||||||
|
int nNorm = 0;
|
||||||
|
int nTum = 0;
|
||||||
|
for ( SAMReaderID rid : getToolkit().getDataSource().getReaderIDs() ) {
|
||||||
|
List<String> tags = rid.getTags() ;
|
||||||
|
if ( tags.isEmpty() && call_somatic )
|
||||||
|
throw new UserException.BadInput("In somatic mode all input bam files must be tagged as either 'normal' or 'tumor'. Untagged file: "+
|
||||||
|
getToolkit().getSourceFileForReaderID(rid));
|
||||||
|
boolean normal = false;
|
||||||
|
boolean tumor = false;
|
||||||
|
for ( String s : tags ) { // we allow additional unrelated tags (and we do not use them), but we REQUIRE one of Tumor/Normal to be present if --somatic is on
|
||||||
|
if ( "NORMAL".equals(s.toUpperCase()) ) {
|
||||||
|
normal = true;
|
||||||
|
nNorm++;
|
||||||
|
}
|
||||||
|
if ( "TUMOR".equals(s.toUpperCase()) ) {
|
||||||
|
tumor = true;
|
||||||
|
nTum++ ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ( call_somatic && normal && tumor ) throw new UserException.BadInput("Input bam file "+
|
||||||
|
getToolkit().getSourceFileForReaderID(rid)+" is tagged both as normal and as tumor. Which one is it??");
|
||||||
|
if ( call_somatic && !normal && ! tumor )
|
||||||
|
throw new UserException.BadInput("In somatic mode all input bams must be tagged as either normal or tumor. Encountered untagged file: "+
|
||||||
|
getToolkit().getSourceFileForReaderID(rid));
|
||||||
|
if ( ! call_somatic && (normal || tumor) )
|
||||||
|
System.out.println("WARNING: input bam file "+getToolkit().getSourceFileForReaderID(rid)
|
||||||
|
+" is tagged as Normal and/or Tumor, but somatic mode is not on. Tags will ne IGNORED");
|
||||||
|
if ( call_somatic && tumor ) {
|
||||||
|
for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader(rid).getReadGroups() ) {
|
||||||
|
tumorSamples.add(rg.getSample());
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader(rid).getReadGroups() ) {
|
||||||
|
normalSamples.add(rg.getSample());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
location = GenomeLocParser.createGenomeLoc(0,1);
|
location = GenomeLocParser.createGenomeLoc(0,1);
|
||||||
|
|
||||||
List<Set<String>> readGroupSets = getToolkit().getMergedReadGroupsByReaders();
|
// List<Set<String>> readGroupSets = getToolkit().getMergedReadGroupsByReaders();
|
||||||
List<Set<String>> sampleSets = getToolkit().getSamplesByReaders();
|
// List<Set<String>> sampleSets = getToolkit().getSamplesByReaders();
|
||||||
|
|
||||||
normalSamples = sampleSets.get(0);
|
normalSamples = getToolkit().getSamplesByReaders().get(0);
|
||||||
|
|
||||||
if ( call_somatic ) {
|
// if ( call_somatic ) {
|
||||||
if ( nSams != 2 ) {
|
// if ( nSams != 2 ) {
|
||||||
System.out.println("In --somatic mode two input bam files must be specified (normal/tumor)");
|
// System.out.println("In --somatic mode two input bam files must be specified (normal/tumor)");
|
||||||
System.exit(1);
|
// System.exit(1);
|
||||||
}
|
// }
|
||||||
tumor_context = new WindowContext(0,WINDOW_SIZE);
|
|
||||||
|
|
||||||
normalReadGroups = readGroupSets.get(0); // first -I option must specify normal.bam
|
// normalReadGroups = readGroupSets.get(0); // first -I option must specify normal.bam
|
||||||
System.out.println(normalReadGroups.size() + " normal read groups");
|
// System.out.println(normalReadGroups.size() + " normal read groups");
|
||||||
//for ( String rg : normalReadGroups ) System.out.println("Normal RG: "+rg);
|
// for ( String rg : normalReadGroups ) System.out.println("Normal RG: "+rg);
|
||||||
|
|
||||||
tumorReadGroups = readGroupSets.get(1); // second -I option must specify tumor.bam
|
// tumorReadGroups = readGroupSets.get(1); // second -I option must specify tumor.bam
|
||||||
System.out.println(tumorReadGroups.size() + " tumor read groups");
|
// System.out.println(tumorReadGroups.size() + " tumor read groups");
|
||||||
// for ( String rg : tumorReadGroups ) System.out.println("Tumor RG: "+rg);
|
// for ( String rg : tumorReadGroups ) System.out.println("Tumor RG: "+rg);
|
||||||
|
|
||||||
tumorSamples = sampleSets.get(1);
|
// tumorSamples = sampleSets.get(1);
|
||||||
} else {
|
// } else {
|
||||||
if ( nSams != 1 ) System.out.println("WARNING: multiple input files specified. \n"+
|
// if ( nSams != 1 ) System.out.println("WARNING: multiple input files specified. \n"+
|
||||||
"WARNING: Without --somatic option they will be merged and processed as a single sample");
|
// "WARNING: Without --somatic option they will be merged and processed as a single sample");
|
||||||
}
|
// }
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// we already checked that bedOutput and output_file are not set simultaneously
|
// we already checked that bedOutput and output_file are not set simultaneously
|
||||||
|
|
@ -390,17 +435,36 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
if ( call_somatic ) {
|
if ( call_somatic ) {
|
||||||
|
|
||||||
String rg = (String)read.getAttribute("RG");
|
List<String> tags = getToolkit().getReaderIDForRead(read).getTags();
|
||||||
if ( rg == null )
|
boolean assigned = false;
|
||||||
throw new UserException.MalformedBam(read, "Read "+read.getReadName()+" has no read group in merged stream. RG is required for somatic calls.");
|
for ( String s : tags ) {
|
||||||
|
if ( "NORMAL".equals(s.toUpperCase()) ) {
|
||||||
if ( normalReadGroups.contains(rg) ) {
|
normal_context.add(read,ref.getBases());
|
||||||
normal_context.add(read,ref.getBases());
|
assigned = true;
|
||||||
} else if ( tumorReadGroups.contains(rg) ) {
|
break;
|
||||||
tumor_context.add(read,ref.getBases());
|
}
|
||||||
} else {
|
if ( "TUMOR".equals(s.toUpperCase()) ) {
|
||||||
throw new UserException.MalformedBam(read, "Unrecognized read group in merged stream: "+rg);
|
tumor_context.add(read,ref.getBases());
|
||||||
|
assigned = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
if ( ! assigned )
|
||||||
|
throw new StingException("Read "+read.getReadName()+" from "+getToolkit().getSourceFileForReaderID(getToolkit().getReaderIDForRead(read))+
|
||||||
|
"has no Normal/Tumor tag associated with it");
|
||||||
|
|
||||||
|
// String rg = (String)read.getAttribute("RG");
|
||||||
|
// if ( rg == null )
|
||||||
|
// throw new UserException.MalformedBam(read, "Read "+read.getReadName()+" has no read group in merged stream. RG is required for somatic calls.");
|
||||||
|
|
||||||
|
// if ( normalReadGroups.contains(rg) ) {
|
||||||
|
// normal_context.add(read,ref.getBases());
|
||||||
|
// } else if ( tumorReadGroups.contains(rg) ) {
|
||||||
|
// tumor_context.add(read,ref.getBases());
|
||||||
|
// } else {
|
||||||
|
// throw new UserException.MalformedBam(read, "Unrecognized read group in merged stream: "+rg);
|
||||||
|
// }
|
||||||
|
|
||||||
if ( tumor_context.getReads().size() > MAX_READ_NUMBER ) {
|
if ( tumor_context.getReads().size() > MAX_READ_NUMBER ) {
|
||||||
System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+
|
System.out.println("WARNING: a count of "+MAX_READ_NUMBER+" reads reached in a window "+
|
||||||
refName+':'+tumor_context.getStart()+'-'+tumor_context.getStop()+" in tumor sample. The whole window will be dropped.");
|
refName+':'+tumor_context.getStart()+'-'+tumor_context.getStop()+" in tumor sample. The whole window will be dropped.");
|
||||||
|
|
@ -592,6 +656,8 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
}
|
}
|
||||||
long move_to = adjustedPosition;
|
long move_to = adjustedPosition;
|
||||||
|
|
||||||
|
if ( DEBUG ) System.out.println("DEBUG>> Emitting in somatic mode up to "+position+" force shift="+force+" current window="+tumor_context.getStart()+"-"+tumor_context.getStop());
|
||||||
|
|
||||||
for ( long pos = tumor_context.getStart() ; pos < Math.min(adjustedPosition,tumor_context.getStop()+1) ; pos++ ) {
|
for ( long pos = tumor_context.getStart() ; pos < Math.min(adjustedPosition,tumor_context.getStop()+1) ; pos++ ) {
|
||||||
|
|
||||||
if ( tumor_context.indelsAt(pos).size() == 0 ) continue; // no indels in tumor
|
if ( tumor_context.indelsAt(pos).size() == 0 ) continue; // no indels in tumor
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue