From e73e6a4fb071fd87e6073f3e1443e5c55dcdafed Mon Sep 17 00:00:00 2001 From: ebanks Date: Fri, 9 Apr 2010 16:12:38 +0000 Subject: [PATCH] Significant memory improvements to plink code git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3144 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/refdata/PlinkRod.java | 296 +++++++++--------- .../gatk/refdata/VariantContextAdaptors.java | 23 +- 2 files changed, 165 insertions(+), 154 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRod.java b/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRod.java index 143ae8e87..49938141e 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRod.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRod.java @@ -114,7 +114,13 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator> getGenotypes() { + /* Get the mapping from sample name to genotypes (array of Alleles). + * Important note: none of the Alleles returned here are annotated as reference + * (since the rod doesn't know offhand what the reference allele is). + * + * @return mapping from sample name to genotype + */ + public Map getGenotypes() { return currentVariant.getGenotypes(); } @@ -162,17 +168,15 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator parseTextFormattedPlinkFile( File file ) { try { BufferedReader reader = new BufferedReader( new FileReader ( file ) ); - String header = reader.readLine(); - ArrayList seqVars = instantiateVariantListFromHeader(header); - ArrayList snpOffsets = getSNPOffsetsFromHeader(header); + ArrayList seqVars = new ArrayList(); + int headerFieldCount = instantiateVariantListFromHeader(seqVars, reader.readLine()); sampleNames = new ArrayList(); String line; - long counter = 0; do { line = reader.readLine(); - incorporateInfo(seqVars,snpOffsets,line); + incorporateInfo(seqVars, line, headerFieldCount); } while ( line != null ); @@ -188,27 +192,27 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator vars, List offsets, String plinkLine) { + private void incorporateInfo(List vars, String plinkLine, int headerFieldCount) { if ( plinkLine == null ) { return; } - String[] plinkInfo; if ( plinkFileType != PlinkFileType.STANDARD_PED ) throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file."); - plinkInfo = plinkLine.split("\t"); - String individualName = plinkInfo[1]; - sampleNames.add(individualName); + StringTokenizer st = new StringTokenizer(plinkLine, "\t"); + st.nextToken(); // family ID + sampleNames.add(st.nextToken()); + for (int i = 2; i < headerFieldCount; i++) + st.nextToken(); int snpNumber = 0; - for ( int i : offsets ) { - vars.get(snpNumber).addGenotypeEntry(plinkInfo[i].split("\\s+"), individualName); - snpNumber++; + while ( snpNumber < vars.size() ) { + vars.get(snpNumber++).addGenotypeEntry(st.nextToken().split("\\s+")); } } - private ArrayList instantiateVariantListFromHeader(String header) { + private int instantiateVariantListFromHeader(ArrayList seqVars, String header) { // if the first line is not a comment (what we're used to seeing), // then it's the raw header (comes from de-binary-ing a .bed file) if ( !header.startsWith("#") ) @@ -216,38 +220,18 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator seqVars = new ArrayList(); String[] headerFields = header.split("\t"); + int skippedFields = 0; for ( String field : headerFields ) { - if ( ! headerEntries.contains(field) ) { - // not a standard header, so a variant - seqVars.add(new PlinkVariantInfo(field)); - } + if ( headerEntries.contains(field) ) + skippedFields++; + else + // not a standard header, so a variant + seqVars.add(new PlinkVariantInfo(field)); } - return seqVars; - } - - private ArrayList getSNPOffsetsFromHeader(String header) { - ArrayList offsets = new ArrayList(); - String[] headerFields; - if ( plinkFileType == PlinkFileType.STANDARD_PED ) { - headerFields = header.split("\t+"); - } else { - headerFields = header.split("\\s+"); - } - - int offset = 0; - - for ( String field : headerFields ) { - if ( ! headerEntries.contains(field) ) { - offsets.add(offset); - } - offset++; - } - - return offsets; + return skippedFields; } /* *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** @@ -396,7 +380,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator>> 6 ); return genotypes; } -} -class PlinkVariantInfo implements Comparable { + class PlinkVariantInfo implements Comparable { - private String variantName; - private GenomeLoc loc; - private Map> genotypes = new HashMap>(); + private String variantName; + private GenomeLoc loc; - // for indels - private boolean isIndel = false; - private boolean isInsertion = false; - private int length = 1; + // the list of genotypes in the same order as in sampleNames (using a map here is inefficient) + private List genotypes = new ArrayList(); - // for binary parsing - private String locAllele1; - private String locAllele2; + // map of Alleles seen (so that we can share Allele objects among samples) + HashMap alleles = new HashMap(4); + + // for indels + private boolean isIndel = false; + private boolean isInsertion = false; + private int length = 1; + + // for binary parsing + private String locAllele1; + private String locAllele2; - public PlinkVariantInfo(String variantName) { - this.variantName = variantName; - parseName(); - } + public PlinkVariantInfo(String variantName) { + this.variantName = variantName; + parseName(); + } - public GenomeLoc getLocation() { - return loc; - } + public GenomeLoc getLocation() { + return loc; + } - public String getName() { - return variantName; - } + public String getName() { + return variantName; + } - public Map> getGenotypes() { - return genotypes; - } + public Map getGenotypes() { + Map genotypeMap = new HashMap(); + int index = 0; + for ( Allele[] myAlleles : genotypes ) + genotypeMap.put(sampleNames.get(index++), myAlleles); + return genotypeMap; + } - public boolean isIndel() { - return isIndel; - } + public boolean isIndel() { + return isIndel; + } - public boolean isInsertion() { - return isInsertion; - } + public boolean isInsertion() { + return isInsertion; + } - public int getLength() { - return length; - } + public int getLength() { + return length; + } - public void setGenomeLoc(GenomeLoc loc) { - this.loc = loc; - } + public void setGenomeLoc(GenomeLoc loc) { + this.loc = loc; + } - private void parseName() { - int chromIdx = variantName.indexOf("|c"); - if ( chromIdx == -1 ) - throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c...)"); - String[] pieces = variantName.substring(chromIdx+2).split("_"); - if ( pieces.length < 2 ) - throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p...)"); + private void parseName() { + int chromIdx = variantName.indexOf("|c"); + if ( chromIdx == -1 ) + throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c...)"); + String[] pieces = variantName.substring(chromIdx+2).split("_"); + if ( pieces.length < 2 ) + throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p...)"); - String chrom = pieces[0].trim(); - if ( pieces[1].charAt(0) != 'p' ) - throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p...)"); + String chrom = pieces[0].trim(); + if ( pieces[1].charAt(0) != 'p' ) + throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p...)"); - String pos = pieces[1].substring(1).trim(); - loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos); + String pos = pieces[1].substring(1).trim(); + loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos); - if ( pieces.length > 2 && (pieces[2].startsWith("gI") || pieces[2].startsWith("gD")) ) { - // it's an indel - isIndel = true; - isInsertion = pieces[2].startsWith("gI"); - try { - // length of insertion on reference is still 1 - if ( !isInsertion ) - length = Integer.parseInt(pieces[2].substring(2)); - } catch (NumberFormatException e) { - throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p..._g[I/D][length])"); + if ( pieces.length > 2 && (pieces[2].startsWith("gI") || pieces[2].startsWith("gD")) ) { + // it's an indel + isIndel = true; + isInsertion = pieces[2].startsWith("gI"); + try { + // length of insertion on reference is still 1 + if ( !isInsertion ) + length = Integer.parseInt(pieces[2].substring(2)); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p..._g[I/D][length])"); + } } } - } - public void setAlleles(String al1, String al2) { - if ( al1.equals(PlinkRod.SEQUENOM_NO_CALL) ) { - // encoding for a site at which no variants were detected - locAllele1 = al2; - } else { - locAllele1 = al1; - } - locAllele2 = al2; - } - - public void addGenotypeEntry(String[] alleleStrings, String sampleName) { - - ArrayList alleles = new ArrayList(2); - - for ( String alleleString : alleleStrings ) { - if ( alleleString.equals(PlinkRod.SEQUENOM_NO_CALL) ) - alleles.add(Allele.NO_CALL_STRING.getBytes()); - else - alleles.add(alleleString.getBytes()); + public void setAlleles(String al1, String al2) { + if ( al1.equals(PlinkRod.SEQUENOM_NO_CALL) ) { + // encoding for a site at which no variants were detected + locAllele1 = al2; + } else { + locAllele1 = al1; + } + locAllele2 = al2; } - genotypes.put(sampleName, alleles); - } + public void addGenotypeEntry(String[] alleleStrings) { - public void addBinaryGenotypeEntry( int genoTYPE, String sampleName ) { - String[] alleleStr = new String[2]; - if ( genoTYPE == 0 ) { - alleleStr[0] = locAllele1; - alleleStr[1] = locAllele1; - } else if (genoTYPE == 2) { - alleleStr[0] = locAllele1; - alleleStr[1] = locAllele2; - } else if (genoTYPE == 3 ) { - alleleStr[0] = locAllele2; - alleleStr[1] = locAllele2; - } else { - alleleStr[0] = "0"; - alleleStr[1] = "0"; + Allele[] myAlleles = new Allele[2]; + + for (int i = 0; i < 2; i++) { + if ( alleleStrings.length <= i ) { + myAlleles[i] = null; + continue; + } + + String alleleString = alleleStrings[i]; + + Allele allele; + if ( alleles.containsKey(alleleString) ) { + allele = alleles.get(alleleString); + } else { + if ( PlinkRod.SEQUENOM_NO_CALL.equals(alleleString) ) + allele = Allele.NO_CALL; + else + allele = new Allele(alleleString); + alleles.put(alleleString, allele); + } + + myAlleles[i] = allele; + } + + genotypes.add(myAlleles); } - addGenotypeEntry(alleleStr, sampleName); - } + public void addBinaryGenotypeEntry(int genoTYPE) { + String[] alleleStr = new String[2]; + if ( genoTYPE == 0 ) { + alleleStr[0] = locAllele1; + alleleStr[1] = locAllele1; + } else if (genoTYPE == 2) { + alleleStr[0] = locAllele1; + alleleStr[1] = locAllele2; + } else if (genoTYPE == 3 ) { + alleleStr[0] = locAllele2; + alleleStr[1] = locAllele2; + } else { + alleleStr[0] = "0"; + alleleStr[1] = "0"; + } - public int compareTo(Object obj) { - if ( ! ( obj instanceof PlinkVariantInfo) ) { - return 1; + addGenotypeEntry(alleleStr); } - return loc.compareTo(((PlinkVariantInfo) obj).getLocation()); + public int compareTo(Object obj) { + if ( ! ( obj instanceof PlinkVariantInfo) ) { + return 1; + } + + return loc.compareTo(((PlinkVariantInfo) obj).getLocation()); + } } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index 9dfc4ef16..3da717f1c 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -391,23 +391,26 @@ public class VariantContextAdaptors { Set genotypes = new HashSet(); - Map> genotypeSets = plink.getGenotypes(); - // for each sample - for ( Map.Entry> genotype : genotypeSets.entrySet() ) { + Map genotypeSets = plink.getGenotypes(); + + // We need to iterate through this list and recreate the Alleles since the + // PlinkRod does not promise to annotate any of the Alleles as reference + // for each sample... + for ( Map.Entry genotype : genotypeSets.entrySet() ) { ArrayList myAlleles = new ArrayList(2); - // for each allele - for ( byte[] alleleString : genotype.getValue() ) { + // for each allele... + for ( Allele myAllele : genotype.getValue() ) { Allele allele; - if ( Allele.wouldBeNoCallAllele(alleleString) ) { + if ( myAllele.isNoCall() ) { allele = Allele.NO_CALL; } else { if ( !plink.isIndel() ) { - allele = new Allele(alleleString, refAllele.basesMatch(alleleString)); - } else if ( Allele.wouldBeNullAllele(alleleString) ) { - allele = new Allele(alleleString, plink.isInsertion()); + allele = new Allele(myAllele.getBases(), refAllele.equals(myAllele, true)); + } else if ( myAllele.isNull() ) { + allele = new Allele(Allele.NULL_ALLELE_STRING, plink.isInsertion()); } else { - allele = new Allele(alleleString, !plink.isInsertion()); + allele = new Allele(myAllele.getBases(), !plink.isInsertion()); } alleles.add(allele);