Significant memory improvements to plink code

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3144 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-04-09 16:12:38 +00:00
parent 75c1987a18
commit e73e6a4fb0
2 changed files with 165 additions and 154 deletions

View File

@ -114,7 +114,13 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
} }
public Map<String, List<byte[]>> getGenotypes() { /* Get the mapping from sample name to genotypes (array of Alleles).
* Important note: none of the Alleles returned here are annotated as reference
* (since the rod doesn't know offhand what the reference allele is).
*
* @return mapping from sample name to genotype
*/
public Map<String, Allele[]> getGenotypes() {
return currentVariant.getGenotypes(); return currentVariant.getGenotypes();
} }
@ -162,17 +168,15 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
private ArrayList<PlinkVariantInfo> parseTextFormattedPlinkFile( File file ) { private ArrayList<PlinkVariantInfo> parseTextFormattedPlinkFile( File file ) {
try { try {
BufferedReader reader = new BufferedReader( new FileReader ( file ) ); BufferedReader reader = new BufferedReader( new FileReader ( file ) );
String header = reader.readLine(); ArrayList<PlinkVariantInfo> seqVars = new ArrayList<PlinkVariantInfo>();
ArrayList<PlinkVariantInfo> seqVars = instantiateVariantListFromHeader(header); int headerFieldCount = instantiateVariantListFromHeader(seqVars, reader.readLine());
ArrayList<Integer> snpOffsets = getSNPOffsetsFromHeader(header);
sampleNames = new ArrayList<String>(); sampleNames = new ArrayList<String>();
String line; String line;
long counter = 0;
do { do {
line = reader.readLine(); line = reader.readLine();
incorporateInfo(seqVars,snpOffsets,line); incorporateInfo(seqVars, line, headerFieldCount);
} while ( line != null ); } while ( line != null );
@ -188,27 +192,27 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
} }
} }
private void incorporateInfo(List<PlinkVariantInfo> vars, List<Integer> offsets, String plinkLine) { private void incorporateInfo(List<PlinkVariantInfo> vars, String plinkLine, int headerFieldCount) {
if ( plinkLine == null ) { if ( plinkLine == null ) {
return; return;
} }
String[] plinkInfo;
if ( plinkFileType != PlinkFileType.STANDARD_PED ) if ( plinkFileType != PlinkFileType.STANDARD_PED )
throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file."); throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file.");
plinkInfo = plinkLine.split("\t"); StringTokenizer st = new StringTokenizer(plinkLine, "\t");
String individualName = plinkInfo[1]; st.nextToken(); // family ID
sampleNames.add(individualName); sampleNames.add(st.nextToken());
for (int i = 2; i < headerFieldCount; i++)
st.nextToken();
int snpNumber = 0; int snpNumber = 0;
for ( int i : offsets ) { while ( snpNumber < vars.size() ) {
vars.get(snpNumber).addGenotypeEntry(plinkInfo[i].split("\\s+"), individualName); vars.get(snpNumber++).addGenotypeEntry(st.nextToken().split("\\s+"));
snpNumber++;
} }
} }
private ArrayList<PlinkVariantInfo> instantiateVariantListFromHeader(String header) { private int instantiateVariantListFromHeader(ArrayList<PlinkVariantInfo> seqVars, String header) {
// if the first line is not a comment (what we're used to seeing), // if the first line is not a comment (what we're used to seeing),
// then it's the raw header (comes from de-binary-ing a .bed file) // then it's the raw header (comes from de-binary-ing a .bed file)
if ( !header.startsWith("#") ) if ( !header.startsWith("#") )
@ -216,38 +220,18 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
plinkFileType = PlinkFileType.STANDARD_PED; plinkFileType = PlinkFileType.STANDARD_PED;
ArrayList<PlinkVariantInfo> seqVars = new ArrayList<PlinkVariantInfo>();
String[] headerFields = header.split("\t"); String[] headerFields = header.split("\t");
int skippedFields = 0;
for ( String field : headerFields ) { for ( String field : headerFields ) {
if ( ! headerEntries.contains(field) ) { if ( headerEntries.contains(field) )
// not a standard header, so a variant skippedFields++;
seqVars.add(new PlinkVariantInfo(field)); else
} // not a standard header, so a variant
seqVars.add(new PlinkVariantInfo(field));
} }
return seqVars; return skippedFields;
}
private ArrayList<Integer> getSNPOffsetsFromHeader(String header) {
ArrayList<Integer> offsets = new ArrayList<Integer>();
String[] headerFields;
if ( plinkFileType == PlinkFileType.STANDARD_PED ) {
headerFields = header.split("\t+");
} else {
headerFields = header.split("\\s+");
}
int offset = 0;
for ( String field : headerFields ) {
if ( ! headerEntries.contains(field) ) {
offsets.add(offset);
}
offset++;
}
return offsets;
} }
/* *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** /* *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
@ -396,7 +380,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
// four genotypes encoded in this byte // four genotypes encoded in this byte
int[] genotypes = parseGenotypes(genotype); int[] genotypes = parseGenotypes(genotype);
for ( int g : genotypes ) { for ( int g : genotypes ) {
variants.get(snpOffset).addBinaryGenotypeEntry(g,sampleNames.get(sampleOffset)); variants.get(snpOffset).addBinaryGenotypeEntry(g);
if ( major ) { if ( major ) {
sampleOffset++; sampleOffset++;
@ -420,135 +404,159 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
genotypes[3] = ( ( genotype & 192 ) >>> 6 ); genotypes[3] = ( ( genotype & 192 ) >>> 6 );
return genotypes; return genotypes;
} }
}
class PlinkVariantInfo implements Comparable { class PlinkVariantInfo implements Comparable {
private String variantName; private String variantName;
private GenomeLoc loc; private GenomeLoc loc;
private Map<String, List<byte[]>> genotypes = new HashMap<String, List<byte[]>>();
// for indels // the list of genotypes in the same order as in sampleNames (using a map here is inefficient)
private boolean isIndel = false; private List<Allele[]> genotypes = new ArrayList<Allele[]>();
private boolean isInsertion = false;
private int length = 1;
// for binary parsing // map of Alleles seen (so that we can share Allele objects among samples)
private String locAllele1; HashMap<String, Allele> alleles = new HashMap<String, Allele>(4);
private String locAllele2;
// for indels
private boolean isIndel = false;
private boolean isInsertion = false;
private int length = 1;
// for binary parsing
private String locAllele1;
private String locAllele2;
public PlinkVariantInfo(String variantName) { public PlinkVariantInfo(String variantName) {
this.variantName = variantName; this.variantName = variantName;
parseName(); parseName();
} }
public GenomeLoc getLocation() { public GenomeLoc getLocation() {
return loc; return loc;
} }
public String getName() { public String getName() {
return variantName; return variantName;
} }
public Map<String, List<byte[]>> getGenotypes() { public Map<String, Allele[]> getGenotypes() {
return genotypes; Map<String, Allele[]> genotypeMap = new HashMap<String, Allele[]>();
} int index = 0;
for ( Allele[] myAlleles : genotypes )
genotypeMap.put(sampleNames.get(index++), myAlleles);
return genotypeMap;
}
public boolean isIndel() { public boolean isIndel() {
return isIndel; return isIndel;
} }
public boolean isInsertion() { public boolean isInsertion() {
return isInsertion; return isInsertion;
} }
public int getLength() { public int getLength() {
return length; return length;
} }
public void setGenomeLoc(GenomeLoc loc) { public void setGenomeLoc(GenomeLoc loc) {
this.loc = loc; this.loc = loc;
} }
private void parseName() { private void parseName() {
int chromIdx = variantName.indexOf("|c"); int chromIdx = variantName.indexOf("|c");
if ( chromIdx == -1 ) if ( chromIdx == -1 )
throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c...)"); throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c...)");
String[] pieces = variantName.substring(chromIdx+2).split("_"); String[] pieces = variantName.substring(chromIdx+2).split("_");
if ( pieces.length < 2 ) if ( pieces.length < 2 )
throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p...)"); throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p...)");
String chrom = pieces[0].trim(); String chrom = pieces[0].trim();
if ( pieces[1].charAt(0) != 'p' ) if ( pieces[1].charAt(0) != 'p' )
throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p...)"); throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p...)");
String pos = pieces[1].substring(1).trim(); String pos = pieces[1].substring(1).trim();
loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos); loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos);
if ( pieces.length > 2 && (pieces[2].startsWith("gI") || pieces[2].startsWith("gD")) ) { if ( pieces.length > 2 && (pieces[2].startsWith("gI") || pieces[2].startsWith("gD")) ) {
// it's an indel // it's an indel
isIndel = true; isIndel = true;
isInsertion = pieces[2].startsWith("gI"); isInsertion = pieces[2].startsWith("gI");
try { try {
// length of insertion on reference is still 1 // length of insertion on reference is still 1
if ( !isInsertion ) if ( !isInsertion )
length = Integer.parseInt(pieces[2].substring(2)); length = Integer.parseInt(pieces[2].substring(2));
} catch (NumberFormatException e) { } catch (NumberFormatException e) {
throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p..._g[I/D][length])"); throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p..._g[I/D][length])");
}
} }
} }
}
public void setAlleles(String al1, String al2) { public void setAlleles(String al1, String al2) {
if ( al1.equals(PlinkRod.SEQUENOM_NO_CALL) ) { if ( al1.equals(PlinkRod.SEQUENOM_NO_CALL) ) {
// encoding for a site at which no variants were detected // encoding for a site at which no variants were detected
locAllele1 = al2; locAllele1 = al2;
} else { } else {
locAllele1 = al1; locAllele1 = al1;
} }
locAllele2 = al2; locAllele2 = al2;
}
public void addGenotypeEntry(String[] alleleStrings, String sampleName) {
ArrayList<byte[]> alleles = new ArrayList<byte[]>(2);
for ( String alleleString : alleleStrings ) {
if ( alleleString.equals(PlinkRod.SEQUENOM_NO_CALL) )
alleles.add(Allele.NO_CALL_STRING.getBytes());
else
alleles.add(alleleString.getBytes());
} }
genotypes.put(sampleName, alleles); public void addGenotypeEntry(String[] alleleStrings) {
}
public void addBinaryGenotypeEntry( int genoTYPE, String sampleName ) { Allele[] myAlleles = new Allele[2];
String[] alleleStr = new String[2];
if ( genoTYPE == 0 ) { for (int i = 0; i < 2; i++) {
alleleStr[0] = locAllele1; if ( alleleStrings.length <= i ) {
alleleStr[1] = locAllele1; myAlleles[i] = null;
} else if (genoTYPE == 2) { continue;
alleleStr[0] = locAllele1; }
alleleStr[1] = locAllele2;
} else if (genoTYPE == 3 ) { String alleleString = alleleStrings[i];
alleleStr[0] = locAllele2;
alleleStr[1] = locAllele2; Allele allele;
} else { if ( alleles.containsKey(alleleString) ) {
alleleStr[0] = "0"; allele = alleles.get(alleleString);
alleleStr[1] = "0"; } else {
if ( PlinkRod.SEQUENOM_NO_CALL.equals(alleleString) )
allele = Allele.NO_CALL;
else
allele = new Allele(alleleString);
alleles.put(alleleString, allele);
}
myAlleles[i] = allele;
}
genotypes.add(myAlleles);
} }
addGenotypeEntry(alleleStr, sampleName); public void addBinaryGenotypeEntry(int genoTYPE) {
} String[] alleleStr = new String[2];
if ( genoTYPE == 0 ) {
alleleStr[0] = locAllele1;
alleleStr[1] = locAllele1;
} else if (genoTYPE == 2) {
alleleStr[0] = locAllele1;
alleleStr[1] = locAllele2;
} else if (genoTYPE == 3 ) {
alleleStr[0] = locAllele2;
alleleStr[1] = locAllele2;
} else {
alleleStr[0] = "0";
alleleStr[1] = "0";
}
public int compareTo(Object obj) { addGenotypeEntry(alleleStr);
if ( ! ( obj instanceof PlinkVariantInfo) ) {
return 1;
} }
return loc.compareTo(((PlinkVariantInfo) obj).getLocation()); public int compareTo(Object obj) {
if ( ! ( obj instanceof PlinkVariantInfo) ) {
return 1;
}
return loc.compareTo(((PlinkVariantInfo) obj).getLocation());
}
} }
} }

View File

@ -391,23 +391,26 @@ public class VariantContextAdaptors {
Set<Genotype> genotypes = new HashSet<Genotype>(); Set<Genotype> genotypes = new HashSet<Genotype>();
Map<String, List<byte[]>> genotypeSets = plink.getGenotypes(); Map<String, Allele[]> genotypeSets = plink.getGenotypes();
// for each sample
for ( Map.Entry<String, List<byte[]>> genotype : genotypeSets.entrySet() ) { // We need to iterate through this list and recreate the Alleles since the
// PlinkRod does not promise to annotate any of the Alleles as reference
// for each sample...
for ( Map.Entry<String, Allele[]> genotype : genotypeSets.entrySet() ) {
ArrayList<Allele> myAlleles = new ArrayList<Allele>(2); ArrayList<Allele> myAlleles = new ArrayList<Allele>(2);
// for each allele // for each allele...
for ( byte[] alleleString : genotype.getValue() ) { for ( Allele myAllele : genotype.getValue() ) {
Allele allele; Allele allele;
if ( Allele.wouldBeNoCallAllele(alleleString) ) { if ( myAllele.isNoCall() ) {
allele = Allele.NO_CALL; allele = Allele.NO_CALL;
} else { } else {
if ( !plink.isIndel() ) { if ( !plink.isIndel() ) {
allele = new Allele(alleleString, refAllele.basesMatch(alleleString)); allele = new Allele(myAllele.getBases(), refAllele.equals(myAllele, true));
} else if ( Allele.wouldBeNullAllele(alleleString) ) { } else if ( myAllele.isNull() ) {
allele = new Allele(alleleString, plink.isInsertion()); allele = new Allele(Allele.NULL_ALLELE_STRING, plink.isInsertion());
} else { } else {
allele = new Allele(alleleString, !plink.isInsertion()); allele = new Allele(myAllele.getBases(), !plink.isInsertion());
} }
alleles.add(allele); alleles.add(allele);