2011-07-18 22:46:01 +08:00
/ *
2013-01-11 06:04:08 +08:00
* Copyright ( c ) 2012 The Broad Institute
*
* Permission is hereby granted , free of charge , to any person
* obtaining a copy of this software and associated documentation
* files ( the "Software" ) , to deal in the Software without
* restriction , including without limitation the rights to use ,
* copy , modify , merge , publish , distribute , sublicense , and / or sell
* copies of the Software , and to permit persons to whom the
* Software is furnished to do so , subject to the following
* conditions :
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software .
*
* THE SOFTWARE IS PROVIDED "AS IS" , WITHOUT WARRANTY OF ANY KIND ,
* EXPRESS OR IMPLIED , INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY , FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT . IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM , DAMAGES OR OTHER LIABILITY ,
* WHETHER IN AN ACTION OF CONTRACT , TORT OR OTHERWISE , ARISING
* FROM , OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE .
* /
2011-07-18 22:46:01 +08:00
package org.broadinstitute.sting ;
import org.apache.commons.io.FileUtils ;
2012-06-07 02:02:01 +08:00
import org.apache.log4j.Logger ;
2011-07-18 22:46:01 +08:00
import org.broadinstitute.sting.gatk.walkers.diffengine.DiffEngine ;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException ;
import java.io.* ;
import java.math.BigInteger ;
import java.security.MessageDigest ;
import java.util.Arrays ;
/**
 * Created by IntelliJ IDEA.
 * User: depristo
 * Date: 7/18/11
 * Time: 9:10 AM
 *
 * Utilities for manipulating the MD5 database of previous results
 */
public class MD5DB {
2012-06-07 02:02:01 +08:00
public static final Logger logger = Logger . getLogger ( MD5DB . class ) ;
2011-07-18 22:46:01 +08:00
/ * *
* Subdirectory under the ant build directory where we store integration test md5 results
* /
2012-06-19 21:46:26 +08:00
private static final int MAX_RECORDS_TO_READ = 1000000 ;
Algorithmically faster version of DiffEngine
-- Now only includes leaf nodes in the summary, i.e., summaries of the form "*.*....*.X", which are really the most valuable to see. This calculation can be accomplished in linear time for N differences, rather than the previous O(n^2) algorithm
-- Now computes the max number of elements to read correctly. Counts now the size of the entire element tree, not just the count of the roots, which was painful because the trees vary by orders of magnitude in size.
-- Because of this we can enforce a meaningful, useful value for the max elements in MD5 or 100K, and this works well.
-- Added integration test for new leaf and old pairwise calculations
-- Bugfix for Utils.join(sep, int[]) that was eating the first element of the AD, PL fields
2012-06-11 08:13:18 +08:00
private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = - 1 ;
2011-07-18 22:46:01 +08:00
public static final String LOCAL_MD5_DB_DIR = "integrationtests" ;
public static final String GLOBAL_MD5_DB_DIR = "/humgen/gsa-hpprojects/GATK/data/integrationtests" ;
2012-06-07 02:02:01 +08:00
// tracking and emitting a data file of origina and new md5s
private final File MD5MismatchesFile ;
private final PrintStream md5MismatchStream ;
public MD5DB ( ) {
this ( new File ( MD5DB . LOCAL_MD5_DB_DIR + "/md5mismatches.txt" ) ) ;
}
public MD5DB ( final File MD5MismatchesFile ) {
this . MD5MismatchesFile = MD5MismatchesFile ;
ensureMd5DbDirectory ( ) ;
logger . debug ( "Creating md5 mismatch db at " + MD5MismatchesFile ) ;
try {
md5MismatchStream = new PrintStream ( new FileOutputStream ( MD5MismatchesFile ) ) ;
md5MismatchStream . printf ( "%s\t%s\t%s%n" , "expected" , "observed" , "test" ) ;
} catch ( FileNotFoundException e ) {
throw new ReviewedStingException ( "Failed to open md5 mismatch file" , e ) ;
}
}
public void close ( ) {
if ( md5MismatchStream ! = null ) {
logger . debug ( "Closeing md5 mismatch db at " + MD5MismatchesFile ) ;
md5MismatchStream . close ( ) ;
}
}
2011-07-18 22:46:01 +08:00
// ----------------------------------------------------------------------
//
// MD5 DB stuff
//
// ----------------------------------------------------------------------
/ * *
* Create the MD5 file directories if necessary
* /
2012-06-07 02:02:01 +08:00
private void ensureMd5DbDirectory ( ) {
2011-07-18 22:46:01 +08:00
File dir = new File ( LOCAL_MD5_DB_DIR ) ;
if ( ! dir . exists ( ) ) {
System . out . printf ( "##### Creating MD5 db %s%n" , LOCAL_MD5_DB_DIR ) ;
if ( ! dir . mkdir ( ) ) {
throw new ReviewedStingException ( "Infrastructure failure: failed to create md5 directory " + LOCAL_MD5_DB_DIR ) ;
}
}
}
/ * *
* Returns the path to an already existing file with the md5 contents , or valueIfNotFound
* if no such file exists in the db .
*
* @param md5
* @param valueIfNotFound
* @return
* /
2012-06-07 02:02:01 +08:00
public String getMD5FilePath ( final String md5 , final String valueIfNotFound ) {
2011-07-27 05:35:30 +08:00
// we prefer the global db to the local DB, so match it first
for ( String dir : Arrays . asList ( GLOBAL_MD5_DB_DIR , LOCAL_MD5_DB_DIR ) ) {
2011-07-18 22:46:01 +08:00
File f = getFileForMD5 ( md5 , dir ) ;
if ( f . exists ( ) & & f . canRead ( ) )
return f . getPath ( ) ;
}
return valueIfNotFound ;
}
/ * *
* Utility function that given a file ' s md5 value and the path to the md5 db ,
* returns the canonical name of the file . For example , if md5 is XXX and db is YYY ,
* this will return YYY / XXX . integrationtest
*
* @param md5
* @param dbPath
* @return
* /
2012-06-07 02:02:01 +08:00
private File getFileForMD5 ( final String md5 , final String dbPath ) {
2011-07-18 22:46:01 +08:00
final String basename = String . format ( "%s.integrationtest" , md5 ) ;
return new File ( dbPath + "/" + basename ) ;
}
/ * *
* Copies the results file with md5 value to its canonical file name and db places
*
* @param md5
* @param resultsFile
* /
2012-06-07 02:02:01 +08:00
private void updateMD5Db ( final String md5 , final File resultsFile ) {
2011-07-18 22:46:01 +08:00
copyFileToDB ( getFileForMD5 ( md5 , LOCAL_MD5_DB_DIR ) , resultsFile ) ;
copyFileToDB ( getFileForMD5 ( md5 , GLOBAL_MD5_DB_DIR ) , resultsFile ) ;
}
/ * *
* Low - level utility routine that copies resultsFile to dbFile
* @param dbFile
* @param resultsFile
* /
2012-06-07 02:02:01 +08:00
private void copyFileToDB ( File dbFile , final File resultsFile ) {
2011-07-18 22:46:01 +08:00
if ( ! dbFile . exists ( ) ) {
// the file isn't already in the db, copy it over
System . out . printf ( "##### Updating MD5 file: %s%n" , dbFile . getPath ( ) ) ;
try {
FileUtils . copyFile ( resultsFile , dbFile ) ;
} catch ( IOException e ) {
System . out . printf ( "##### Skipping update, cannot write file %s%n" , dbFile ) ;
}
} else {
2011-10-05 06:53:52 +08:00
//System.out.printf("##### MD5 file is up to date: %s%n", dbFile.getPath());
2011-07-18 22:46:01 +08:00
}
}
/ * *
* Returns the byte [ ] of the entire contents of file , for md5 calculations
* @param file
* @return
* @throws IOException
* /
private static byte [ ] getBytesFromFile ( File file ) throws IOException {
InputStream is = new FileInputStream ( file ) ;
// Get the size of the file
long length = file . length ( ) ;
if ( length > Integer . MAX_VALUE ) {
// File is too large
}
// Create the byte array to hold the data
byte [ ] bytes = new byte [ ( int ) length ] ;
// Read in the bytes
int offset = 0 ;
int numRead = 0 ;
while ( offset < bytes . length
& & ( numRead = is . read ( bytes , offset , bytes . length - offset ) ) > = 0 ) {
offset + = numRead ;
}
// Ensure all the bytes have been read in
if ( offset < bytes . length ) {
throw new IOException ( "Could not completely read file " + file . getName ( ) ) ;
}
// Close the input stream and return bytes
is . close ( ) ;
return bytes ;
}
2011-10-05 06:53:52 +08:00
public static class MD5Match {
2012-05-24 22:50:33 +08:00
final String actualMD5 , expectedMD5 ;
2011-10-05 06:53:52 +08:00
final String failMessage ;
boolean failed ;
2012-05-24 22:50:33 +08:00
public MD5Match ( final String actualMD5 , final String expectedMD5 , final String failMessage , final boolean failed ) {
this . actualMD5 = actualMD5 ;
this . expectedMD5 = expectedMD5 ;
2011-10-05 06:53:52 +08:00
this . failMessage = failMessage ;
this . failed = failed ;
}
}
2011-07-18 22:46:01 +08:00
/ * *
* Tests a file MD5 against an expected value , returning the MD5 . NOTE : This function WILL throw an exception if the MD5s are different .
* @param name Name of the test .
* @param resultsFile File to MD5 .
* @param expectedMD5 Expected MD5 value .
* @param parameterize If true or if expectedMD5 is an empty string , will print out the calculated MD5 instead of error text .
* @return The calculated MD5 .
* /
2012-06-07 02:02:01 +08:00
public MD5Match assertMatchingMD5 ( final String name , final File resultsFile , final String expectedMD5 , final boolean parameterize ) {
2012-05-24 22:50:33 +08:00
final String actualMD5 = testFileMD5 ( name , resultsFile , expectedMD5 , parameterize ) ;
2011-10-05 06:53:52 +08:00
String failMessage = null ;
boolean failed = false ;
2011-07-18 22:46:01 +08:00
if ( parameterize | | expectedMD5 . equals ( "" ) ) {
// Don't assert
2012-05-24 22:50:33 +08:00
} else if ( actualMD5 . equals ( expectedMD5 ) ) {
//BaseTest.log(String.format(" => %s PASSED (expected=%s)", name, expectedMD5));
2011-07-18 22:46:01 +08:00
} else {
2011-10-05 06:53:52 +08:00
failed = true ;
2012-05-24 22:50:33 +08:00
failMessage = String . format ( "%s has mismatching MD5s: expected=%s observed=%s" , name , expectedMD5 , actualMD5 ) ;
2011-07-18 22:46:01 +08:00
}
2012-05-24 22:50:33 +08:00
return new MD5Match ( actualMD5 , expectedMD5 , failMessage , failed ) ;
2011-07-18 22:46:01 +08:00
}
/ * *
* Tests a file MD5 against an expected value , returning the MD5 . NOTE : This function WILL NOT throw an exception if the MD5s are different .
* @param name Name of the test .
* @param resultsFile File to MD5 .
* @param expectedMD5 Expected MD5 value .
* @param parameterize If true or if expectedMD5 is an empty string , will print out the calculated MD5 instead of error text .
* @return The calculated MD5 .
* /
2012-06-07 02:02:01 +08:00
public String testFileMD5 ( final String name , final File resultsFile , final String expectedMD5 , final boolean parameterize ) {
2011-07-18 22:46:01 +08:00
try {
byte [ ] bytesOfMessage = getBytesFromFile ( resultsFile ) ;
byte [ ] thedigest = MessageDigest . getInstance ( "MD5" ) . digest ( bytesOfMessage ) ;
BigInteger bigInt = new BigInteger ( 1 , thedigest ) ;
String filemd5sum = bigInt . toString ( 16 ) ;
while ( filemd5sum . length ( ) < 32 ) filemd5sum = "0" + filemd5sum ; // pad to length 32
//
// copy md5 to integrationtests
//
updateMD5Db ( filemd5sum , resultsFile ) ;
if ( parameterize | | expectedMD5 . equals ( "" ) ) {
2012-05-24 22:50:33 +08:00
BaseTest . log ( String . format ( "PARAMETERIZATION: file %s has md5 = %s" , resultsFile , filemd5sum ) ) ;
2011-07-18 22:46:01 +08:00
} else {
2011-10-05 06:53:52 +08:00
//System.out.println(String.format("Checking MD5 for %s [calculated=%s, expected=%s]", resultsFile, filemd5sum, expectedMD5));
//System.out.flush();
2011-07-18 22:46:01 +08:00
if ( ! expectedMD5 . equals ( filemd5sum ) ) {
// we are going to fail for real in assertEquals (so we are counted by the testing framework).
// prepare ourselves for the comparison
2011-10-14 06:01:51 +08:00
System . out . printf ( "##### Test %s is going to fail #####%n" , name ) ;
2011-07-18 22:46:01 +08:00
String pathToExpectedMD5File = getMD5FilePath ( expectedMD5 , "[No DB file found]" ) ;
String pathToFileMD5File = getMD5FilePath ( filemd5sum , "[No DB file found]" ) ;
2012-05-29 08:20:05 +08:00
BaseTest . log ( String . format ( "expected %s" , expectedMD5 ) ) ;
BaseTest . log ( String . format ( "calculated %s" , filemd5sum ) ) ;
2012-05-24 22:50:33 +08:00
BaseTest . log ( String . format ( "diff %s %s" , pathToExpectedMD5File , pathToFileMD5File ) ) ;
2011-07-18 22:46:01 +08:00
2012-06-07 02:02:01 +08:00
md5MismatchStream . printf ( "%s\t%s\t%s%n" , expectedMD5 , filemd5sum , name ) ;
md5MismatchStream . flush ( ) ;
2011-07-18 22:46:01 +08:00
// inline differences
2012-05-24 22:50:33 +08:00
final ByteArrayOutputStream baos = new ByteArrayOutputStream ( ) ;
final PrintStream ps = new PrintStream ( baos ) ;
Algorithmically faster version of DiffEngine
-- Now only includes leaf nodes in the summary, i.e., summaries of the form "*.*....*.X", which are really the most valuable to see. This calculation can be accomplished in linear time for N differences, rather than the previous O(n^2) algorithm
-- Now computes the max number of elements to read correctly. Counts now the size of the entire element tree, not just the count of the roots, which was painful because the trees vary by orders of magnitude in size.
-- Because of this we can enforce a meaningful, useful value for the max elements in MD5 or 100K, and this works well.
-- Added integration test for new leaf and old pairwise calculations
-- Bugfix for Utils.join(sep, int[]) that was eating the first element of the AD, PL fields
2012-06-11 08:13:18 +08:00
DiffEngine . SummaryReportParams params = new DiffEngine . SummaryReportParams ( ps , 20 , 10 , 0 , MAX_RAW_DIFFS_TO_SUMMARIZE , false ) ;
2011-07-26 02:58:31 +08:00
boolean success = DiffEngine . simpleDiffFiles ( new File ( pathToExpectedMD5File ) , new File ( pathToFileMD5File ) , MAX_RECORDS_TO_READ , params ) ;
2012-05-24 22:50:33 +08:00
if ( success ) {
final String content = baos . toString ( ) ;
BaseTest . log ( content ) ;
2011-07-18 22:46:01 +08:00
System . out . printf ( "Note that the above list is not comprehensive. At most 20 lines of output, and 10 specific differences will be listed. Please use -T DiffObjects -R public/testdata/exampleFASTA.fasta -m %s -t %s to explore the differences more freely%n" ,
pathToExpectedMD5File , pathToFileMD5File ) ;
2012-05-24 22:50:33 +08:00
}
ps . close ( ) ;
2011-07-18 22:46:01 +08:00
}
}
return filemd5sum ;
} catch ( Exception e ) {
throw new RuntimeException ( "Failed to read bytes from calls file: " + resultsFile , e ) ;
}
}
}