package org.broadinstitute.sting.utils;

import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RodVCF;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.utils.genotype.vcf.VCFReader;

import java.util.*;

/**
 * SampleUtils is a static class (no instantiation allowed!) with some utility methods for
 * collecting sample names from SAM file headers and from VCF rods.
 *
 * @author ebanks
 */
public class SampleUtils {

    /**
     * Private constructor.  No instantiating this class!
     */
    private SampleUtils() {}

    /**
     * Pull out the samples from a SAMFileHeader;
     * note that we use a TreeSet so that they are sorted
     *
     * @param header  the sam file header
     * @return sorted set of strings representing the sample names; read groups whose sample
     *         is null are skipped
     */
    public static Set<String> getSAMFileSamples(SAMFileHeader header) {
        // get all of the unique sample names
        Set<String> samples = new TreeSet<String>();
        List<SAMReadGroupRecord> readGroups = header.getReadGroups();
        for ( SAMReadGroupRecord readGroup : readGroups ) {
            String sample = readGroup.getSample();
            // a naturally-ordered TreeSet rejects null elements, so skip read groups without a sample
            if ( sample != null ) {
                samples.add(sample);
            }
        }
        return samples;
    }

    /**
     * Gets all of the unique sample names from all VCF rods input by the user
     *
     * @param toolkit    GATK engine
     *
     * @return the sorted set of unique samples
     */
    public static Set<String> getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit) {
        Set<String> samples = new TreeSet<String>();

        // iterate to get all of the sample names
        List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
        for ( ReferenceOrderedDataSource source : dataSources ) {
            RMDTrack rod = source.getReferenceOrderedData();
            if ( rod.getType().equals(RodVCF.class) ) {
                VCFReader reader = new VCFReader(rod.getFile());
                try {
                    samples.addAll(reader.getHeader().getGenotypeSamples());
                } finally {
                    // always release the reader, even if reading the header fails
                    reader.close();
                }
            }
        }

        return samples;
    }

    /**
     * Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap
     * by appending the name of the rod each one came from (e.g. sampleX.rod1, sampleX.rod2, ...).
     * When finished, samples contains the uniquified sample names and rodNamesToSampleNames contains a mapping
     * from rod/sample pairs to the new uniquified names
     *
     * @param toolkit                GATK engine
     * @param samples                set to store the sample names
     * @param rodNamesToSampleNames  mapping of rod/sample pairs to new uniquified sample names
     */
    public static void getUniquifiedSamplesFromRods(GenomeAnalysisEngine toolkit,
                                                    Set<String> samples,
                                                    Map<Pair<String, String>, String> rodNamesToSampleNames) {

        // keep a map of sample name to occurrences encountered
        Map<String, Integer> sampleOverlapMap = new HashMap<String, Integer>();

        // iterate to get all of the sample names
        List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
        for ( ReferenceOrderedDataSource source : dataSources ) {
            RMDTrack rod = source.getReferenceOrderedData();
            if ( rod.getType().equals(RodVCF.class) ) {
                VCFReader reader = new VCFReader(rod.getFile());
                try {
                    Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
                    for ( String sample : vcfSamples ) {
                        addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, rod.getName());
                    }
                } finally {
                    // always release the reader, even if uniquifying a sample fails
                    reader.close();
                }
            }
        }
    }

    /**
     * Records one sample seen in one rod, renaming it (and, on the second sighting, the
     * previously recorded occurrence) to "sample.rodName" when the bare name is already taken.
     *
     * @param samples                set of sample names being built up
     * @param sampleOverlapMap       number of times each original sample name has been seen so far
     * @param rodNamesToSampleNames  mapping of rod/sample pairs to the (possibly uniquified) sample names
     * @param newSample              original name of the sample being added
     * @param rodName                name of the rod the sample was read from
     */
    private static void addUniqueSample(Set<String> samples,
                                        Map<String, Integer> sampleOverlapMap,
                                        Map<Pair<String, String>, String> rodNamesToSampleNames,
                                        String newSample,
                                        String rodName) {

        // how many occurrences have we seen so far?
        Integer occurrences = sampleOverlapMap.get(newSample);

        // if this is the first one, just add it to the list of samples
        if ( occurrences == null ) {
            samples.add(newSample);
            rodNamesToSampleNames.put(new Pair<String, String>(rodName, newSample), newSample);
            sampleOverlapMap.put(newSample, 1);
        }

        // if it's already been seen multiple times, give it a unique suffix and increment the value
        else if ( occurrences >= 2 ) {
            String uniqueName = newSample + "." + rodName;
            samples.add(uniqueName);
            rodNamesToSampleNames.put(new Pair<String, String>(rodName, newSample), uniqueName);
            sampleOverlapMap.put(newSample, occurrences + 1);
        }

        // if this is the second occurrence of the sample name, uniquify both of them
        else { // occurrences == 1: the name has been recorded exactly once before

            // remove the 1st occurrence, uniquify it, and add it back
            samples.remove(newSample);
            String uniqueName1 = null;
            // NOTE(review): the lookup matches on the mapped value only, so the first entry whose
            // current name equals newSample is renamed -- confirm no other rod/sample pair can
            // legitimately map to that same string.
            for ( Map.Entry<Pair<String, String>, String> entry : rodNamesToSampleNames.entrySet() ) {
                if ( entry.getValue().equals(newSample) ) {
                    uniqueName1 = newSample + "." + entry.getKey().first;
                    entry.setValue(uniqueName1);
                    break;
                }
            }
            // only re-add the first occurrence if its mapping was found; adding null would
            // throw on a sorted set and pollute any other set
            if ( uniqueName1 != null ) {
                samples.add(uniqueName1);
            }

            // add the second one
            String uniqueName2 = newSample + "." + rodName;
            samples.add(uniqueName2);
            rodNamesToSampleNames.put(new Pair<String, String>(rodName, newSample), uniqueName2);

            sampleOverlapMap.put(newSample, 2);
        }
    }
}