now has two modes: one sample - just call indel sites; two samples - call somatic-looking variants only. Still uses heuristic count-based cutoffs, cutoffs are hardcoded and are pretty conservative...
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@967 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
5451bbfd5a
commit
06e5a765f8
|
|
@ -1,25 +1,77 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers.indels;
|
package org.broadinstitute.sting.playground.gatk.walkers.indels;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import net.sf.samtools.Cigar;
|
import net.sf.samtools.Cigar;
|
||||||
import net.sf.samtools.CigarElement;
|
import net.sf.samtools.CigarElement;
|
||||||
|
import net.sf.samtools.SAMFileReader;
|
||||||
|
import net.sf.samtools.SAMReadGroupRecord;
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
import org.broadinstitute.sting.playground.utils.CircularArray;
|
import org.broadinstitute.sting.playground.utils.CircularArray;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
|
||||||
|
|
||||||
public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
|
@Argument(fullName="bed", shortName="bed", doc="BED output file name", required=true)
|
||||||
|
public java.io.File bed_file;
|
||||||
|
@Argument(fullName="somatic", shortName="somatic",
|
||||||
|
doc="Perform somatic calls; two input alignment files must be specified", required=false)
|
||||||
|
public boolean call_somatic = false;
|
||||||
|
|
||||||
|
private static int WINDOW_SIZE = 200;
|
||||||
private RunningCoverage coverage;
|
private RunningCoverage coverage;
|
||||||
|
private RunningCoverage normal_coverage; // when performing somatic calls, we will be using this one for normal, and 'coverage' for tumor
|
||||||
private int currentContigIndex = -1;
|
private int currentContigIndex = -1;
|
||||||
|
private String refName = null;
|
||||||
|
private java.io.Writer output = null;
|
||||||
|
|
||||||
|
private Set<String> normal_samples = new HashSet<String>();
|
||||||
|
private Set<String> tumor_samples = new HashSet<String>();
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
coverage = new RunningCoverage(0,100);
|
coverage = new RunningCoverage(0,WINDOW_SIZE);
|
||||||
|
|
||||||
|
int nSams = getToolkit().getArguments().samFiles.size();
|
||||||
|
|
||||||
|
|
||||||
|
if ( call_somatic ) {
|
||||||
|
if ( nSams != 2 ) {
|
||||||
|
System.out.println("In --somatic mode two input bam files must be specified (normal/tumor)");
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
normal_coverage = new RunningCoverage(0,WINDOW_SIZE);
|
||||||
|
|
||||||
|
// this is an ugly hack: we want to be able to tell what file (tumor/normal sample) each read came from,
|
||||||
|
// but reads do not carry this information!
|
||||||
|
|
||||||
|
SAMFileReader rn = new SAMFileReader(getToolkit().getArguments().samFiles.get(0));
|
||||||
|
for ( SAMReadGroupRecord rec : rn.getFileHeader().getReadGroups() ) {
|
||||||
|
normal_samples.add(rec.getSample());
|
||||||
|
}
|
||||||
|
rn.close();
|
||||||
|
rn = new SAMFileReader(getToolkit().getArguments().samFiles.get(1));
|
||||||
|
for ( SAMReadGroupRecord rec : rn.getFileHeader().getReadGroups() ) {
|
||||||
|
tumor_samples.add(rec.getSample());
|
||||||
|
}
|
||||||
|
rn.close();
|
||||||
|
|
||||||
|
} else {
|
||||||
|
if ( nSams != 1 ) System.out.println("WARNING: multiple input files specified. \n"+
|
||||||
|
"WARNING: Without --somatic option they will be merged and processed as a single sample");
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
output = new java.io.FileWriter(bed_file);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new StingException("Failed to open file for writing BED output");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
@ -30,13 +82,19 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
read.getNotPrimaryAlignmentFlag() ||
|
read.getNotPrimaryAlignmentFlag() ||
|
||||||
read.getMappingQuality() == 0 ) return 0; // we do not need those reads!
|
read.getMappingQuality() == 0 ) return 0; // we do not need those reads!
|
||||||
|
|
||||||
if ( read.getReferenceIndex() != currentContigIndex ) {
|
|
||||||
|
if ( read.getReferenceIndex() != currentContigIndex ) {
|
||||||
|
// we just jumped onto a new contig
|
||||||
|
|
||||||
if ( read.getReferenceIndex() < currentContigIndex ) // paranoidal
|
if ( read.getReferenceIndex() < currentContigIndex ) // paranoidal
|
||||||
throw new StingException("Read "+read.getReadName()+": contig is out of order");
|
throw new StingException("Read "+read.getReadName()+": contig is out of order");
|
||||||
|
|
||||||
|
if ( call_somatic) emit_somatic(1000000000); // print remaining indels from the previous contig (if any);
|
||||||
|
else emit(1000000000);
|
||||||
currentContigIndex = read.getReferenceIndex();
|
currentContigIndex = read.getReferenceIndex();
|
||||||
|
refName = new String(read.getReferenceName());
|
||||||
coverage.clear(); // reset coverage window; this will also set reference position to 0
|
coverage.clear(); // reset coverage window; this will also set reference position to 0
|
||||||
|
if ( call_somatic) normal_coverage.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( read.getAlignmentStart() < coverage.getStart() ) {
|
if ( read.getAlignmentStart() < coverage.getStart() ) {
|
||||||
|
|
@ -46,30 +104,164 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
// reads are sorted; we are not going to see any more coverage or new indels prior
|
// reads are sorted; we are not going to see any more coverage or new indels prior
|
||||||
// to current read's start position!
|
// to current read's start position!
|
||||||
for ( long pos = coverage.getStart() ; pos < Math.min(read.getAlignmentStart(),coverage.getStop()+1) ; pos++ ) {
|
|
||||||
List<IndelVariant> variants = coverage.indelsAt(pos);
|
|
||||||
if ( variants.size() == 0 ) continue;
|
|
||||||
System.out.print(read.getReferenceName()+"\t"+pos+"\t"+coverage.coverageAt(pos));
|
|
||||||
for ( IndelVariant var : variants ) {
|
|
||||||
System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount());
|
|
||||||
}
|
|
||||||
System.out.println();
|
|
||||||
}
|
|
||||||
|
|
||||||
coverage.shift((int)(read.getAlignmentStart() - coverage.getStart() ) );
|
if ( call_somatic ) emit_somatic( read.getAlignmentStart() );
|
||||||
|
else emit( read.getAlignmentStart() );
|
||||||
|
|
||||||
|
|
||||||
if ( read.getAlignmentEnd() > coverage.getStop()) {
|
if ( read.getAlignmentEnd() > coverage.getStop()) {
|
||||||
// should never happen
|
// should never happen
|
||||||
throw new StingException("Read "+read.getReadName()+": out of coverage window bounds");
|
throw new StingException("Read "+read.getReadName()+": out of coverage window bounds.\n"+
|
||||||
|
"Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+
|
||||||
|
read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+"; window start="+coverage.getStart()+
|
||||||
|
"; window end="+coverage.getStop());
|
||||||
}
|
}
|
||||||
|
|
||||||
coverage.add(read,ref);
|
if ( call_somatic ) {
|
||||||
|
String rg = (String)read.getAttribute("RG");
|
||||||
|
|
||||||
|
String sample = read.getHeader().getReadGroup(rg).getSample();
|
||||||
|
|
||||||
|
if ( normal_samples.contains(sample) ) {
|
||||||
|
normal_coverage.add(read,ref);
|
||||||
|
} else if ( tumor_samples.contains(sample) ) {
|
||||||
|
coverage.add(read,ref);
|
||||||
|
} else {
|
||||||
|
throw new StingException("Unrecognized sample: "+sample);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
coverage.add(read, ref);
|
||||||
|
}
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Output indel calls up to the specified position and shift the coverage array(s): after this method is executed
|
||||||
|
* first elements of the coverage arrays map onto 'position'
|
||||||
|
*
|
||||||
|
* @param position
|
||||||
|
*/
|
||||||
|
private void emit(long position) {
|
||||||
|
|
||||||
|
for ( long pos = coverage.getStart() ; pos < Math.min(position,coverage.getStop()+1) ; pos++ ) {
|
||||||
|
|
||||||
|
List<IndelVariant> variants = coverage.indelsAt(pos);
|
||||||
|
if ( variants.size() == 0 ) continue; // no indels
|
||||||
|
|
||||||
|
int cov = coverage.coverageAt(pos);
|
||||||
|
|
||||||
|
if ( cov < 6 ) continue; // low coverage
|
||||||
|
|
||||||
|
int total_variant_count = 0;
|
||||||
|
int max_variant_count = 0;
|
||||||
|
String indelString = null;
|
||||||
|
int event_length = 0; // length of the event on the reference
|
||||||
|
|
||||||
|
for ( IndelVariant var : variants ) {
|
||||||
|
int cnt = var.getCount();
|
||||||
|
total_variant_count +=cnt;
|
||||||
|
if ( cnt > max_variant_count ) {
|
||||||
|
max_variant_count = cnt;
|
||||||
|
indelString = var.getBases();
|
||||||
|
event_length = var.lengthOnRef();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ( (double)total_variant_count > 0.3*cov && (double) max_variant_count > 0.7*total_variant_count ) {
|
||||||
|
|
||||||
|
try {
|
||||||
|
output.write(refName+"\t"+(pos-1)+"\t"+(event_length > 0 ? pos-1+event_length : pos-1)+
|
||||||
|
"\t"+(event_length>0? "-":"+")+indelString +":"+total_variant_count+"/"+cov+"\n");
|
||||||
|
} catch (IOException e) {
|
||||||
|
System.out.println(e.getMessage());
|
||||||
|
e.printStackTrace();
|
||||||
|
throw new StingException("Error encountered while writing into output BED file");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// for ( IndelVariant var : variants ) {
|
||||||
|
// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount());
|
||||||
|
// }
|
||||||
|
}
|
||||||
|
|
||||||
|
coverage.shift((int)(position - coverage.getStart() ) );
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Output somatic indel calls up to the specified position and shift the coverage array(s): after this method is executed
|
||||||
|
* first elements of the coverage arrays map onto 'position'
|
||||||
|
*
|
||||||
|
* @param position
|
||||||
|
*/
|
||||||
|
private void emit_somatic(long position) {
|
||||||
|
|
||||||
|
for ( long pos = coverage.getStart() ; pos < Math.min(position,coverage.getStop()+1) ; pos++ ) {
|
||||||
|
|
||||||
|
List<IndelVariant> tumor_variants = coverage.indelsAt(pos);
|
||||||
|
List<IndelVariant> normal_variants = normal_coverage.indelsAt(pos);
|
||||||
|
|
||||||
|
if ( tumor_variants.size() == 0 ) continue; // no indels in tumor
|
||||||
|
|
||||||
|
|
||||||
|
int tumor_cov = coverage.coverageAt(pos);
|
||||||
|
int normal_cov = normal_coverage.coverageAt(pos);
|
||||||
|
|
||||||
|
if ( tumor_cov < 6 ) continue; // low coverage
|
||||||
|
if ( normal_cov < 4 ) continue; // low coverage
|
||||||
|
|
||||||
|
int total_variant_count_tumor = 0;
|
||||||
|
int max_variant_count_tumor = 0;
|
||||||
|
String indelStringTumor = null;
|
||||||
|
int event_length_tumor = 0; // length of the event on the reference
|
||||||
|
|
||||||
|
for ( IndelVariant var : tumor_variants ) {
|
||||||
|
int cnt = var.getCount();
|
||||||
|
total_variant_count_tumor +=cnt;
|
||||||
|
if ( cnt > max_variant_count_tumor ) {
|
||||||
|
max_variant_count_tumor = cnt;
|
||||||
|
indelStringTumor = var.getBases();
|
||||||
|
event_length_tumor = var.lengthOnRef();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( (double)total_variant_count_tumor > 0.3*tumor_cov && (double) max_variant_count_tumor > 0.7*total_variant_count_tumor ) {
|
||||||
|
|
||||||
|
System.out.println(refName+"\t"+(pos-1)+"\t"+(event_length_tumor > 0 ? pos-1+event_length_tumor : pos-1)+
|
||||||
|
"\t"+(event_length_tumor >0? "-":"+")+indelStringTumor +":"+total_variant_count_tumor+"/"+tumor_cov);
|
||||||
|
if ( normal_variants.size() == 0 ) {
|
||||||
|
|
||||||
|
try {
|
||||||
|
output.write(refName+"\t"+(pos-1)+"\t"+(event_length_tumor > 0 ? pos-1+event_length_tumor : pos-1)+
|
||||||
|
"\t"+(event_length_tumor >0? "-":"+")+indelStringTumor +":"+total_variant_count_tumor+"/"+tumor_cov+"\n");
|
||||||
|
} catch (IOException e) {
|
||||||
|
System.out.println(e.getMessage());
|
||||||
|
e.printStackTrace();
|
||||||
|
throw new StingException("Error encountered while writing into output BED file");
|
||||||
|
}
|
||||||
|
System.out.println("INDICATION FOR SOMATIC\n---------------------------\n");
|
||||||
|
} else {
|
||||||
|
System.out.println("INDICATION FOR GERMLINE\n---------------------------\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// for ( IndelVariant var : variants ) {
|
||||||
|
// System.out.print("\t"+var.getType()+"\t"+var.getBases()+"\t"+var.getCount());
|
||||||
|
// }
|
||||||
|
}
|
||||||
|
|
||||||
|
coverage.shift((int)(position - coverage.getStart() ) );
|
||||||
|
normal_coverage.shift((int)(position - normal_coverage.getStart() ) );
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onTraversalDone(Integer result) {
|
||||||
|
emit(1000000000); // emit everything we might have left
|
||||||
|
try {
|
||||||
|
output.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
System.out.println("Failed to close output BED file gracefully, data may be lost");
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
super.onTraversalDone(result);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Integer reduce(Integer value, Integer sum) {
|
public Integer reduce(Integer value, Integer sum) {
|
||||||
sum += value;
|
sum += value;
|
||||||
|
|
@ -99,6 +291,15 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
count += i;
|
count += i;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns length of the event on the reference (number of deleted bases
|
||||||
|
* for deletions, -1 for insertions.
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public int lengthOnRef() {
|
||||||
|
if ( type == Type.D ) return bases.length();
|
||||||
|
else return -1;
|
||||||
|
}
|
||||||
|
|
||||||
public void increment() { count+=1; }
|
public void increment() { count+=1; }
|
||||||
|
|
||||||
public int getCount() { return count; }
|
public int getCount() { return count; }
|
||||||
|
|
@ -235,8 +436,8 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
if ( type == null ) continue; // element was not an indel, go grab next element...
|
if ( type == null ) continue; // element was not an indel, go grab next element...
|
||||||
|
|
||||||
// we got an indel if we are here...
|
// we got an indel if we are here...
|
||||||
if ( i == 0 ) System.out.println("WARNING: Indel at the start of the read "+r.getReadName());
|
if ( i == 0 ) logger.warn("Indel at the start of the read "+r.getReadName());
|
||||||
if ( i == nCigarElems - 1) System.out.println("WARNING: Indel at the end of the read "+r.getReadName());
|
if ( i == nCigarElems - 1) logger.warn("Indel at the end of the read "+r.getReadName());
|
||||||
|
|
||||||
updateCount(indelPosition, type, bases);
|
updateCount(indelPosition, type, bases);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue