Reorganized init() to check for RODs (reference / truth)
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6050 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6f5a08ddc6
commit
3f974c62e6
|
|
@ -8,13 +8,16 @@ import org.broadinstitute.sting.commandline.Output;
|
||||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData;
|
||||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
|
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import org.apache.commons.math.util.*;
|
import org.apache.commons.math.util.*;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -30,8 +33,6 @@ import java.util.regex.Pattern;
|
||||||
* A reference sample name must be provided and it must be barcoded uniquely.
|
* A reference sample name must be provided and it must be barcoded uniquely.
|
||||||
*/
|
*/
|
||||||
public class ReplicationValidationWalker extends LocusWalker<Integer, Long> implements TreeReducible<Long> {
|
public class ReplicationValidationWalker extends LocusWalker<Integer, Long> implements TreeReducible<Long> {
|
||||||
@Argument(shortName="refSample", fullName="reference_sample_name", doc="reference sample name", required=true)
|
|
||||||
String referenceSampleName = "NA12878";
|
|
||||||
|
|
||||||
@Argument(shortName="nsamples", fullName="number_of_samples", doc="Number of samples in the dataset (not counting the reference sample).", required=true)
|
@Argument(shortName="nsamples", fullName="number_of_samples", doc="Number of samples in the dataset (not counting the reference sample).", required=true)
|
||||||
int nSamples = -1;
|
int nSamples = -1;
|
||||||
|
|
@ -52,36 +53,28 @@ public class ReplicationValidationWalker extends LocusWalker<Integer, Long> impl
|
||||||
PrintStream out;
|
PrintStream out;
|
||||||
|
|
||||||
int maxAlleleCount;
|
int maxAlleleCount;
|
||||||
List<LaneData> lanes;
|
String referenceSampleName;
|
||||||
|
boolean USE_TRUTH_ROD;
|
||||||
|
|
||||||
private class LaneData {
|
final String REFERENCE_ROD_NAME = "reference";
|
||||||
String id;
|
final String TRUTH_ROD_NAME = "truth";
|
||||||
List <String> sampleNames;
|
|
||||||
String referenceSample;
|
|
||||||
|
|
||||||
public String getId() {
|
|
||||||
return id;
|
/**
|
||||||
}
|
* GATK Engine creates readgroups of the form XXX.Y.Z
|
||||||
|
* XXX.Y is the unique lane identifier.
|
||||||
public void setId(String id) {
|
* Z is the id of the sample to make the read group id unique
|
||||||
this.id = id;
|
* This function returns a list of unique lane identifiers.
|
||||||
}
|
* @param readGroups readGroups A collection of read group strings (obtained from the alignment context pileup)
|
||||||
|
* @return a collection of lane ids.
|
||||||
public List<String> getSampleNames() {
|
*/
|
||||||
return sampleNames;
|
private Set<String> getLaneIDs(Collection<String> readGroups) {
|
||||||
}
|
HashSet<String> result = new HashSet<String>();
|
||||||
|
for (String rgid : readGroups) {
|
||||||
public void setSampleNames(List<String> sampleNames) {
|
String [] parsedId = rgid.split("\\.");
|
||||||
this.sampleNames = sampleNames;
|
result.add(parsedId[0] + "." + parsedId[1]);
|
||||||
}
|
|
||||||
|
|
||||||
public String getReferenceSample() {
|
|
||||||
return referenceSample;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setReferenceSample(String referenceSample) {
|
|
||||||
this.referenceSample = referenceSample;
|
|
||||||
}
|
}
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -107,7 +100,12 @@ public class ReplicationValidationWalker extends LocusWalker<Integer, Long> impl
|
||||||
return model;
|
return model;
|
||||||
}
|
}
|
||||||
|
|
||||||
// buildErrorModel helper functions
|
/**
|
||||||
|
* Returns the number of mismatching bases in a pileup
|
||||||
|
* @param data the bases of a pileup
|
||||||
|
* @param refBase the reference sample base to compare to
|
||||||
|
* @return number of bases in data that are different from refBase
|
||||||
|
*/
|
||||||
private int getNumberOfMismatches(byte[] data, byte refBase) {
|
private int getNumberOfMismatches(byte[] data, byte refBase) {
|
||||||
int mismatches = 0;
|
int mismatches = 0;
|
||||||
for (byte seqBase : data) {
|
for (byte seqBase : data) {
|
||||||
|
|
@ -117,41 +115,35 @@ public class ReplicationValidationWalker extends LocusWalker<Integer, Long> impl
|
||||||
return mismatches;
|
return mismatches;
|
||||||
}
|
}
|
||||||
|
|
||||||
// private String getLaneId (SAMReadGroupRecord rg) {
|
|
||||||
// String [] idParts = rg.getPlatformUnit().split("\\.");
|
|
||||||
// return idParts[0] + idParts[1]; // 0 is run_id, 1 is lane_number.
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// private List<LaneData> createLanesList () {
|
|
||||||
// List<SAMReadGroupRecord> rgs = getToolkit().getSAMFileHeader().getReadGroups();
|
|
||||||
// for(SAMReadGroupRecord rg : rgs) {
|
|
||||||
// String laneId = getLaneId(rg);
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// return new LinkedList<LaneData>();
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// private int getNumberOfSamples() {
|
|
||||||
// int nSamples = 0;
|
|
||||||
// List<Set<String>> readers = getToolkit().getSamplesByReaders();
|
|
||||||
// for(Set<String> reader : readers) {
|
|
||||||
// nSamples += reader.size();
|
|
||||||
// if (reader.contains(referenceSampleName))
|
|
||||||
// nSamples--;
|
|
||||||
// }
|
|
||||||
// return nSamples;
|
|
||||||
// }
|
|
||||||
|
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
if (overrideMaxAlleleCount > 0)
|
|
||||||
maxAlleleCount = overrideMaxAlleleCount;
|
// Set the max allele count (defines the size of the error model array)
|
||||||
else
|
maxAlleleCount = (overrideMaxAlleleCount > 0) ? overrideMaxAlleleCount : nSamples*nChromosomes;
|
||||||
maxAlleleCount = nSamples*nChromosomes;
|
|
||||||
|
// Look for the reference ROD and the optional truth ROD. If truth is provided, set the truth "test" mode ON.
|
||||||
|
List<ReferenceOrderedDataSource> rods = getToolkit().getRodDataSources();
|
||||||
|
if (rods.size() < 1) {
|
||||||
|
throw new IllegalArgumentException("You must provide a reference ROD.");
|
||||||
|
}
|
||||||
|
boolean foundReferenceROD = false;
|
||||||
|
boolean foundTruthROD = false;
|
||||||
|
for (ReferenceOrderedDataSource rod : rods) {
|
||||||
|
if (rod.getName().equals(REFERENCE_ROD_NAME)) {
|
||||||
|
foundReferenceROD = true;
|
||||||
|
}
|
||||||
|
if (rod.getName().equals(TRUTH_ROD_NAME)) {
|
||||||
|
foundReferenceROD = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!foundReferenceROD) {
|
||||||
|
throw new IllegalArgumentException("You haven't provided a reference ROD. Note that the reference ROD must be labeled " + REFERENCE_ROD_NAME + ".");
|
||||||
|
}
|
||||||
|
if (rods.size() > 1 && !foundTruthROD) {
|
||||||
|
throw new IllegalArgumentException("You haven't provided a truth ROD. Note that the reference ROD must be labeled " + TRUTH_ROD_NAME + ".");
|
||||||
|
}
|
||||||
|
USE_TRUTH_ROD = foundTruthROD;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
ReadBackedPileup contextPileup = context.getBasePileup();
|
ReadBackedPileup contextPileup = context.getBasePileup();
|
||||||
Set<String> lanesInLocus = getLaneIDs(contextPileup.getReadGroups());
|
Set<String> lanesInLocus = getLaneIDs(contextPileup.getReadGroups());
|
||||||
|
|
@ -175,7 +167,9 @@ public class ReplicationValidationWalker extends LocusWalker<Integer, Long> impl
|
||||||
if (referenceSamplePileup != null && !referenceSamplePileup.isEmpty() && !poolPileup.isEmpty()) {
|
if (referenceSamplePileup != null && !referenceSamplePileup.isEmpty() && !poolPileup.isEmpty()) {
|
||||||
|
|
||||||
// Build error model
|
// Build error model
|
||||||
double [] errorModel = buildErrorModel(poolPileup.getBases(), referenceSamplePileup.getBases()[0], defaultPrior);
|
//TODO: Build a reference sample VCF and get true reference base from there.
|
||||||
|
byte trueReferenceBase = referenceSamplePileup.getBases()[0]; // THIS IS A QUICK FIX JUST TO BE ABLE TO TEST -- IT IS WRONG!
|
||||||
|
double [] errorModel = buildErrorModel(referenceSamplePileup.getBases(), trueReferenceBase, defaultPrior);
|
||||||
|
|
||||||
// Debug error model
|
// Debug error model
|
||||||
System.out.println(laneID + ":");
|
System.out.println(laneID + ":");
|
||||||
|
|
@ -191,44 +185,8 @@ public class ReplicationValidationWalker extends LocusWalker<Integer, Long> impl
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* GATK Engine creates readgroups of the form XXX.Y.Z
|
|
||||||
* XXX.Y is the unique lane identifier.
|
|
||||||
* Z is the id of the sample to make the read group id unique
|
|
||||||
* This function returns a map that groups together all samples that were run in the same lane:
|
|
||||||
* XXX.Y -> {Z1, Z2, Z3, ... Zn}
|
|
||||||
* @param readGroups A collection of read group strings (obtained from the alignment context pileup)
|
|
||||||
* @return A map of lanes and samples, as described above
|
|
||||||
*/
|
|
||||||
// private HashMap<String, ArrayList<String>> decodeReadIDs(Collection<String> readGroups) {
|
|
||||||
// HashMap<String, ArrayList<String>> result = new HashMap<String, ArrayList<String>>();
|
|
||||||
// for (String rgid : readGroups) {
|
|
||||||
// String [] parsedId = rgid.split("\\.");
|
|
||||||
// String lane = parsedId[0] + "." + parsedId[1];
|
|
||||||
// String id = parsedId[2];
|
|
||||||
// ArrayList<String> idList = (result.get(lane) == null) ? (new ArrayList<String>()) : result.get(lane);
|
|
||||||
// idList.add(id);
|
|
||||||
// result.put(lane, idList);
|
|
||||||
// }
|
|
||||||
// return result;
|
|
||||||
// }
|
|
||||||
|
|
||||||
/**
|
|
||||||
* GATK Engine creates readgroups of the form XXX.Y.Z
|
|
||||||
* XXX.Y is the unique lane identifier.
|
|
||||||
* Z is the id of the sample to make the read group id unique
|
|
||||||
* This function returns a list of unique lane identifiers.
|
|
||||||
* @param readGroups readGroups A collection of read group strings (obtained from the alignment context pileup)
|
|
||||||
* @return a collection of lane ids.
|
|
||||||
*/
|
|
||||||
private Set<String> getLaneIDs(Collection<String> readGroups) {
|
|
||||||
HashSet<String> result = new HashSet<String>();
|
|
||||||
for (String rgid : readGroups) {
|
|
||||||
String [] parsedId = rgid.split("\\.");
|
|
||||||
result.add(parsedId[0] + "." + parsedId[1]);
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Long reduceInit() {
|
public Long reduceInit() {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue