IndelGenotyper now uses GATK::getMergedReadGroupsByReaders() to sort out which read in the merged stream is for normal, and which is for tumor (in --somatic mode, apparently)
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1316 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a361e7b342
commit
5eca4c353c
|
|
@ -1,16 +1,13 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers.indels;
|
package org.broadinstitute.sting.playground.gatk.walkers.indels;
|
||||||
|
|
||||||
import net.sf.picard.sam.SamFileHeaderMerger;
|
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.Reads;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.RODIterator;
|
import org.broadinstitute.sting.gatk.refdata.RODIterator;
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
import org.broadinstitute.sting.gatk.refdata.Transcript;
|
import org.broadinstitute.sting.gatk.refdata.Transcript;
|
||||||
import org.broadinstitute.sting.gatk.refdata.rodRefSeq;
|
import org.broadinstitute.sting.gatk.refdata.rodRefSeq;
|
||||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
||||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
|
||||||
import org.broadinstitute.sting.gatk.filters.Platform454Filter;
|
import org.broadinstitute.sting.gatk.filters.Platform454Filter;
|
||||||
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
||||||
import org.broadinstitute.sting.playground.utils.CircularArray;
|
import org.broadinstitute.sting.playground.utils.CircularArray;
|
||||||
|
|
@ -69,8 +66,8 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
private RODIterator<rodRefSeq> refseqIterator=null;
|
private RODIterator<rodRefSeq> refseqIterator=null;
|
||||||
|
|
||||||
private Set<String> normal_samples = new HashSet<String>();
|
private Set<String> normalReadGroups;
|
||||||
private Set<String> tumor_samples = new HashSet<String>();
|
private Set<String> tumorReadGroups ;
|
||||||
|
|
||||||
private int MISMATCH_WIDTH = 5; // 5 bases on each side of the indel
|
private int MISMATCH_WIDTH = 5; // 5 bases on each side of the indel
|
||||||
private int MISMATCH_CUTOFF = 1000000;
|
private int MISMATCH_CUTOFF = 1000000;
|
||||||
|
|
@ -105,6 +102,8 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
location = GenomeLocParser.createGenomeLoc(0,1);
|
location = GenomeLocParser.createGenomeLoc(0,1);
|
||||||
|
|
||||||
|
List<Set<String>> readGroupSets = getToolkit().getMergedReadGroupsByReaders();
|
||||||
|
|
||||||
if ( call_somatic ) {
|
if ( call_somatic ) {
|
||||||
if ( nSams != 2 ) {
|
if ( nSams != 2 ) {
|
||||||
System.out.println("In --somatic mode two input bam files must be specified (normal/tumor)");
|
System.out.println("In --somatic mode two input bam files must be specified (normal/tumor)");
|
||||||
|
|
@ -112,24 +111,35 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
}
|
}
|
||||||
normal_coverage = new RunningCoverage(0,WINDOW_SIZE);
|
normal_coverage = new RunningCoverage(0,WINDOW_SIZE);
|
||||||
|
|
||||||
// this is an ugly hack: we want to be able to tell what file (tumor/normal sample) each read came from,
|
normalReadGroups = readGroupSets.get(0); // first -I option must specify normal.bam
|
||||||
// but reads do not carry this information!
|
tumorReadGroups = readGroupSets.get(1); // second -I option must specify tumor.bam
|
||||||
|
|
||||||
// SAMFileReader rn = new SAMFileReader(getToolkit().getArguments().samFiles.get(0));
|
|
||||||
// for ( SAMReadGroupRecord rec : rn.getFileHeader().getReadGroups() ) {
|
|
||||||
// normal_samples.add(rec.getSample());
|
|
||||||
// }
|
|
||||||
// rn.close();
|
|
||||||
// rn = new SAMFileReader(getToolkit().getArguments().samFiles.get(1));
|
|
||||||
// for ( SAMReadGroupRecord rec : rn.getFileHeader().getReadGroups() ) {
|
|
||||||
// tumor_samples.add(rec.getSample());
|
|
||||||
// }
|
|
||||||
// rn.close();
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
if ( nSams != 1 ) System.out.println("WARNING: multiple input files specified. \n"+
|
if ( nSams != 1 ) System.out.println("WARNING: multiple input files specified. \n"+
|
||||||
"WARNING: Without --somatic option they will be merged and processed as a single sample");
|
"WARNING: Without --somatic option they will be merged and processed as a single sample");
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
|
List<Set<String>> sample_sets = getToolkit().getSamplesByReaders();
|
||||||
|
for ( int i = 0 ; i < sample_sets.size() ; i++ ) {
|
||||||
|
System.out.print("Reader "+i);
|
||||||
|
for ( String s : sample_sets.get(i) ) System.out.print(" " + s);
|
||||||
|
System.out.println();
|
||||||
|
}
|
||||||
|
|
||||||
|
List<Set<String>> lib_sets = getToolkit().getLibrariesByReaders();
|
||||||
|
for ( int i = 0 ; i < lib_sets.size() ; i++ ) {
|
||||||
|
System.out.print("Reader "+i);
|
||||||
|
for ( String s : lib_sets.get(i) ) System.out.print(" " + s);
|
||||||
|
System.out.println();
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
for ( int i = 0 ; i < readGroupSets.size() ; i++ ) {
|
||||||
|
System.out.print("Reader "+i);
|
||||||
|
for ( String s : readGroupSets.get(i) ) System.out.print(" " + s);
|
||||||
|
System.out.println();
|
||||||
|
}
|
||||||
|
assignReadGroups1();
|
||||||
|
*/
|
||||||
try {
|
try {
|
||||||
output = new java.io.FileWriter(bed_file);
|
output = new java.io.FileWriter(bed_file);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
|
|
@ -137,6 +147,7 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
void assignReadGroups(final SAMFileHeader mergedHeader) {
|
void assignReadGroups(final SAMFileHeader mergedHeader) {
|
||||||
|
|
||||||
Set<String> normal = new HashSet<String>(); // list normal samples here
|
Set<String> normal = new HashSet<String>(); // list normal samples here
|
||||||
|
|
@ -167,74 +178,19 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
void assignReadGroups1(final SAMFileHeader mergedHeader) {
|
|
||||||
|
|
||||||
SAMDataSource d = getToolkit().getDataSource();
|
|
||||||
|
|
||||||
Reads reads = d.getReadsInfo();
|
|
||||||
|
|
||||||
System.out.println(reads.getReadsFiles().size() + " input files");
|
|
||||||
|
|
||||||
System.out.println(d.getHeaderMerger().getReaders().size() +" readers instantiated");
|
|
||||||
for ( SAMFileReader r : d.getHeaderMerger().getReaders() ) {
|
|
||||||
System.out.print("----------\nread groups:");
|
|
||||||
for ( SAMReadGroupRecord g : r.getFileHeader().getReadGroups() ) {
|
|
||||||
System.out.print(" "+g.getReadGroupId()+ " ("+g.getSample()+":"+g.getLibrary()+")");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
System.exit(1);
|
|
||||||
|
|
||||||
Set<String> normal = new HashSet<String>(); // list normal samples here
|
|
||||||
Set<String> tumor = new HashSet<String>(); // list tumor samples here
|
|
||||||
|
|
||||||
SAMFileReader rn = new SAMFileReader(getToolkit().getArguments().samFiles.get(0));
|
|
||||||
for ( SAMReadGroupRecord rec : rn.getFileHeader().getReadGroups() ) {
|
|
||||||
normal.add(new String(rec.getSample()));
|
|
||||||
}
|
|
||||||
rn.close();
|
|
||||||
rn = new SAMFileReader(getToolkit().getArguments().samFiles.get(1));
|
|
||||||
for ( SAMReadGroupRecord rec : rn.getFileHeader().getReadGroups() ) {
|
|
||||||
tumor.add(new String(rec.getSample()));
|
|
||||||
}
|
|
||||||
rn.close();
|
|
||||||
|
|
||||||
// now we know what samples are normal, and what are tumor; let's assign dynamic read groups we get in merged header:
|
|
||||||
for ( SAMReadGroupRecord mr : mergedHeader.getReadGroups() ) {
|
|
||||||
if ( normal.contains(mr.getSample() ) ) {
|
|
||||||
normal_samples.add( new String(mr.getReadGroupId()) );
|
|
||||||
System.out.println("Read group "+ mr.getReadGroupId() + "--> Sample "+ mr.getSample() + " (normal)");
|
|
||||||
} else if ( tumor.contains(mr.getSample() ) ) {
|
|
||||||
tumor_samples.add( new String(mr.getReadGroupId()) );
|
|
||||||
System.out.println("Read group "+ mr.getReadGroupId() + "--> Sample "+ mr.getSample() + " (tumor)");
|
|
||||||
} else throw new StingException("Unrecognized sample "+mr.getSample() +" in merged SAM stream");
|
|
||||||
}
|
|
||||||
System.out.println();
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Integer map(char[] ref, SAMRecord read) {
|
public Integer map(char[] ref, SAMRecord read) {
|
||||||
|
|
||||||
// if ( read.getReadUnmappedFlag() && read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX &&
|
|
||||||
// read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ) {
|
|
||||||
// System.out.println("I think I reached unmapped reads at the end of the file(s) and I am done...");
|
|
||||||
// return 0;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// if ( read.getAlignmentStart() < 112337549 && read.getAlignmentEnd() > 112337550 ) {
|
|
||||||
// System.out.print("adding "+read.getReadString()+" "+read.getCigarString());
|
|
||||||
// }
|
|
||||||
|
|
||||||
if ( read.getReadUnmappedFlag() ||
|
if ( read.getReadUnmappedFlag() ||
|
||||||
read.getDuplicateReadFlag() ||
|
read.getDuplicateReadFlag() ||
|
||||||
read.getNotPrimaryAlignmentFlag() ||
|
read.getNotPrimaryAlignmentFlag() ||
|
||||||
read.getMappingQuality() == 0 ) {
|
read.getMappingQuality() == 0 ) {
|
||||||
// System.out.println(" ignored");
|
|
||||||
return 0; // we do not need those reads!
|
return 0; // we do not need those reads!
|
||||||
}
|
}
|
||||||
// System.out.print(" added");
|
|
||||||
|
|
||||||
if ( read.getReferenceIndex() != currentContigIndex ) {
|
if ( read.getReferenceIndex() != currentContigIndex ) {
|
||||||
// we just jumped onto a new contig
|
// we just jumped onto a new contig
|
||||||
|
|
@ -249,7 +205,6 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
refName = new String(read.getReferenceName());
|
refName = new String(read.getReferenceName());
|
||||||
|
|
||||||
location = GenomeLocParser.setContig(location,refName);
|
location = GenomeLocParser.setContig(location,refName);
|
||||||
// location.setContigIndex(read.getReferenceIndex());
|
|
||||||
|
|
||||||
coverage.clear(); // reset coverage window; this will also set reference position to 0
|
coverage.clear(); // reset coverage window; this will also set reference position to 0
|
||||||
if ( call_somatic) normal_coverage.clear();
|
if ( call_somatic) normal_coverage.clear();
|
||||||
|
|
@ -306,19 +261,12 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
if ( call_somatic ) {
|
if ( call_somatic ) {
|
||||||
|
|
||||||
// this is a hack. currently we can get access to the merged header only through the read,
|
|
||||||
// so below we figure out which of the reassigned read groups in the merged stream are normal
|
|
||||||
// and which are tumor, and we make sure we do it only once:
|
|
||||||
if ( normal_samples.size() == 0 ) assignReadGroups(read.getHeader());
|
|
||||||
|
|
||||||
String rg = (String)read.getAttribute("RG");
|
String rg = (String)read.getAttribute("RG");
|
||||||
if ( rg == null ) throw new StingException("Read "+read.getReadName()+" has no read group in merged stream");
|
if ( rg == null ) throw new StingException("Read "+read.getReadName()+" has no read group in merged stream");
|
||||||
|
|
||||||
if ( normal_samples.contains(rg) ) {
|
if ( normalReadGroups.contains(rg) ) {
|
||||||
// System.out.println(" TO NORMAL");
|
|
||||||
normal_coverage.add(read,ref);
|
normal_coverage.add(read,ref);
|
||||||
} else if ( tumor_samples.contains(rg) ) {
|
} else if ( tumorReadGroups.contains(rg) ) {
|
||||||
// System.out.println(" TO TUMOR");
|
|
||||||
coverage.add(read,ref);
|
coverage.add(read,ref);
|
||||||
} else {
|
} else {
|
||||||
throw new StingException("Unrecognized read group in merged stream: "+rg);
|
throw new StingException("Unrecognized read group in merged stream: "+rg);
|
||||||
|
|
@ -653,7 +601,7 @@ public class IndelGenotyperWalker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
public String getSamples() {
|
public String getSamples() {
|
||||||
StringBuffer sb = new StringBuffer();
|
StringBuffer sb = new StringBuffer();
|
||||||
Iterator i = samples.iterator();
|
Iterator<String> i = samples.iterator();
|
||||||
while ( i.hasNext() ) {
|
while ( i.hasNext() ) {
|
||||||
sb.append(i.next());
|
sb.append(i.next());
|
||||||
if ( i.hasNext() )
|
if ( i.hasNext() )
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue