gatk-3.8/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java

331 lines
14 KiB
Java
Raw Normal View History

/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* SampleUtils is a static class (no instantiation allowed!) with some utility methods for getting samples
* quality scores.
*
* @author ebanks
*/
public class SampleUtils {
/**
* Private constructor. No instantiating this class!
*/
private SampleUtils() {}
/**
* Pull out the samples from a SAMFileHeader;
* note that we use a TreeSet so that they are sorted
*
* @param header the sam file header
* @return list of strings representing the sample names
*/
public static Set<String> getSAMFileSamples(SAMFileHeader header) {
// get all of the unique sample names
Set<String> samples = new TreeSet<String>();
List<SAMReadGroupRecord> readGroups = header.getReadGroups();
for ( SAMReadGroupRecord readGroup : readGroups )
samples.add(readGroup.getSample());
return samples;
}
/**
* Gets all of the unique sample names from all VCF rods input by the user
*
* @param toolkit GATK engine
*
* @return the set of unique samples
*/
public static Set<String> getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit) {
return getUniqueSamplesFromRods(toolkit, null);
}
/**
* Gets all of the unique sample names from the set of provided VCF rod names input by the user
*
* @param toolkit GATK engine
* @param rodNames list of rods to use; if null, uses all VCF rods
*
* @return the set of unique samples
*/
public static Set<String> getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
Set<String> samples = new LinkedHashSet<String>();
for ( VCFHeader header : VCFUtils.getVCFHeadersFromRods(toolkit, rodNames).values() )
samples.addAll(header.getGenotypeSamples());
return samples;
}
public static Set<String> getRodNamesWithVCFHeader(GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
return VCFUtils.getVCFHeadersFromRods(toolkit, rodNames).keySet();
}
public static Set<String> getSampleListWithVCFHeader(GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
return getSampleList(VCFUtils.getVCFHeadersFromRods(toolkit, rodNames));
}
public static Set<String> getSampleList(Map<String, VCFHeader> headers) {
return getSampleList(headers, VariantContextUtils.GenotypeMergeType.PRIORITIZE);
}
public static Set<String> getSampleList(Map<String, VCFHeader> headers, VariantContextUtils.GenotypeMergeType mergeOption) {
Set<String> samples = new TreeSet<String>();
for ( Map.Entry<String, VCFHeader> val : headers.entrySet() ) {
VCFHeader header = val.getValue();
for ( String sample : header.getGenotypeSamples() ) {
samples.add(VariantContextUtils.mergedSampleName(val.getKey(), sample, mergeOption == VariantContextUtils.GenotypeMergeType.UNIQUIFY));
}
}
return samples;
}
/**
* Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap
* (e.g. sampleX.1, sampleX.2, ...)
* When finished, samples contains the uniquified sample names and rodNamesToSampleNames contains a mapping
* from rod/sample pairs to the new uniquified names
*
* @param toolkit GATK engine
* @param samples set to store the sample names
* @param rodNamesToSampleNames mapping of rod/sample pairs to new uniquified sample names
*/
public static void getUniquifiedSamplesFromRods(GenomeAnalysisEngine toolkit, Set<String> samples, Map<Pair<String, String>, String> rodNamesToSampleNames) {
// keep a map of sample name to occurrences encountered
HashMap<String, Integer> sampleOverlapMap = new HashMap<String, Integer>();
// iterate to get all of the sample names
for ( Map.Entry<String, VCFHeader> pair : VCFUtils.getVCFHeadersFromRods(toolkit, null).entrySet() ) {
Set<String> vcfSamples = pair.getValue().getGenotypeSamples();
for ( String sample : vcfSamples )
addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, pair.getKey());
}
}
private static void addUniqueSample(Set<String> samples, Map<String, Integer> sampleOverlapMap, Map<Pair<String, String>, String> rodNamesToSampleNames, String newSample, String rodName) {
// how many occurrences have we seen so far?
Integer occurrences = sampleOverlapMap.get(newSample);
// if this is the first one, just add it to the list of samples
if ( occurrences == null ) {
samples.add(newSample);
rodNamesToSampleNames.put(new Pair<String, String>(rodName, newSample), newSample);
sampleOverlapMap.put(newSample, 1);
}
// if it's already been seen multiple times, give it a unique suffix and increment the value
else if ( occurrences >= 2 ) {
String uniqueName = newSample + "." + rodName;
samples.add(uniqueName);
rodNamesToSampleNames.put(new Pair<String, String>(rodName, newSample), uniqueName);
sampleOverlapMap.put(newSample, occurrences + 1);
}
// if this is the second occurrence of the sample name, uniquify both of them
else { // occurrences == 2
// remove the 1st occurrence, uniquify it, and add it back
samples.remove(newSample);
String uniqueName1 = null;
for ( Map.Entry<Pair<String, String>, String> entry : rodNamesToSampleNames.entrySet() ) {
if ( entry.getValue().equals(newSample) ) {
uniqueName1 = newSample + "." + entry.getKey().first;
entry.setValue(uniqueName1);
break;
}
}
samples.add(uniqueName1);
// add the second one
String uniqueName2 = newSample + "." + rodName;
samples.add(uniqueName2);
rodNamesToSampleNames.put(new Pair<String, String>(rodName, newSample), uniqueName2);
sampleOverlapMap.put(newSample, 2);
}
}
/**
* Returns a new set of samples, containing a final list of samples expanded from sampleArgs
*
* Each element E of sampleArgs can either be a literal sample name or a file. For each E,
* we try to read a file named E from disk, and if possible all lines from that file are expanded
* into unique sample names.
*
* @param sampleArgs
* @return
*/
public static Set<String> getSamplesFromCommandLineInput(Collection<String> sampleArgs) {
if (sampleArgs != null) {
// Let's first go through the list and see if we were given any files. We'll add every entry in the file to our
// sample list set, and treat the entries as if they had been specified on the command line.
Set<String> samplesFromFiles = new HashSet<String>();
for (String SAMPLE_EXPRESSION : sampleArgs) {
File sampleFile = new File(SAMPLE_EXPRESSION);
try {
XReadLines reader = new XReadLines(sampleFile);
List<String> lines = reader.readLines();
for (String line : lines) {
samplesFromFiles.add(line.trim());
}
} catch (FileNotFoundException e) {
samplesFromFiles.add(SAMPLE_EXPRESSION); // not a file, so must be a sample
}
}
return samplesFromFiles;
}
return new HashSet<String>();
}
public static Set<String> getSamplesFromCommandLineInput(Collection<String> vcfSamples, Collection<String> sampleExpressions) {
Set<String> samples = new HashSet<String>();
if (sampleExpressions != null) {
// Let's first go through the list and see if we were given any files. We'll add every entry in the file to our
// sample list set, and treat the entries as if they had been specified on the command line.
Set<String> samplesFromFiles = new HashSet<String>();
for (String sampleExpression : sampleExpressions) {
File sampleFile = new File(sampleExpression);
try {
XReadLines reader = new XReadLines(sampleFile);
List<String> lines = reader.readLines();
for (String line : lines) {
samplesFromFiles.add(line);
}
} catch (FileNotFoundException e) {
// ignore exception
}
}
sampleExpressions.addAll(samplesFromFiles);
// Let's now assume that the values in sampleExpressions are literal sample names and not regular
// expressions. Extract those samples specifically so we don't make the mistake of selecting more
// than what the user really wants.
Set<String> possibleSampleRegexs = new HashSet<String>();
for (String sampleExpression : sampleExpressions) {
if (!(new File(sampleExpression).exists())) {
if (vcfSamples.contains(sampleExpression)) {
samples.add(sampleExpression);
} else {
possibleSampleRegexs.add(sampleExpression);
}
}
}
// Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions
for (String sampleRegex : possibleSampleRegexs) {
Pattern p = Pattern.compile(sampleRegex);
for (String vcfSample : vcfSamples) {
Matcher m = p.matcher(vcfSample);
if (m.find()) {
samples.add(vcfSample);
}
}
}
} else {
samples.addAll(vcfSamples);
}
return samples;
}
Many updates to SelectVariants : 1) There is now a different parameter for sample name (-sn), sample file (-sf) or sample expression (-se). The unexpected behavior of the previous implementation was way too tricky to leave unchecked. (if you had a file or directory named after a sample name, SV wouldn't work) 1b) Fixed a TODO added by Eric -- now the output vcf always has the samples sorted alphabetically regardless of input (this came as a byproduct of the implementation of 1) 2) Discordance and Concordance now work in combination with all other parameters. 3) Discordance now follows Guillermo's suggestion where the discordance track is your VCF and the variant track is the one you are comparing to. I have updated the example in the wiki to reflect this change in interpretation. 4) If you DON'T provide any samples (-sn, -se or -sf), SelectVariants works with all samples from the VCF and ignores sample/genotype information when doing concordance or discordance. That is, it will report every "missing line" or "concordant line" in the two vcfs, regardless of sample or genotype information. 5) When samples are provided (-sn, -se or -sf) discordance and concordance will go down to the genotypes to determine whether or not you have a discordance/concordance event. In this case, a concordance happens only when the two VCFs display the same sample/genotype information for that locus, and discordance happens when the disc track is missing the line or has a different genotype information for that sample. 6) When dealing with multiple samples, concordance only happens if ALL your samples agree, and discordance happens if AT LEAST ONE of your samples disagree. --- Integration tests: 1) Discordance and concordance test added 2) All other tests updated to comply with the new 'sorted output' format and different inputs for samples. --- Methods for handling sample expressions and files with list of samples were added to SampleUtils. I recommend *NOT USING* the old getSamplesFromCommandLineInput as this mixing of sample names with expressions and files creates a rogue error that can be challenging to catch. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6072 348d0f76-0448-11de-a6fe-93d51630548a
2011-06-24 04:18:45 +08:00
/**
* Given a collection of samples and a collection of regular expressions, generates the set of samples that match each expression
* @param originalSamples list of samples to select samples from
* @param sampleExpressions list of expressions to use for matching samples
* @return the set of samples from originalSamples that satisfy at least one of the expressions in sampleExpressions
*/
public static Collection<String> matchSamplesExpressions (Collection<String> originalSamples, Collection<String> sampleExpressions) {
// Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions
Set<String> samples = new HashSet<String>();
if (sampleExpressions != null) {
for (String expression : sampleExpressions) {
Pattern p = Pattern.compile(expression);
for (String originalSample : originalSamples) {
Matcher m = p.matcher(originalSample);
if (m.find()) {
samples.add(originalSample);
}
}
}
}
return samples;
}
/**
* Given a list of files with sample names it reads all files and creates a list of unique samples from all these files.
* @param files list of files with sample names in
* @return a collection of unique samples from all files
*/
public static Collection<String> getSamplesFromFiles (Collection<File> files) {
Set<String> samplesFromFiles = new HashSet<String>();
if (files != null) {
for (File file : files) {
try {
XReadLines reader = new XReadLines(file);
List<String> lines = reader.readLines();
for (String line : lines) {
samplesFromFiles.add(line);
}
} catch (FileNotFoundException e) {
// ignore exception
}
}
}
return samplesFromFiles;
}
}