Added hash map generation test.
Hugo Ferreira committed Apr 5, 2017
1 parent 7467be9 commit 96f9ec0
Showing 2 changed files with 90 additions and 27 deletions.
52 changes: 34 additions & 18 deletions src/main/scala/pt/inescn/utils/NABUtils.scala
@@ -92,23 +92,24 @@ object NABUtils {
}
}

def isData(f: File, ext: String) = {
val r1 = { f.extension(includeDot = false).exists { e => e.equalsIgnoreCase(ext) } }
r1 && (!f.name.toLowerCase.contains("scores"))
}

/**
* Get all the data files
*/
def allDataFiles(dataDir: File = File(data_dir), ext: String = "csv"): Option[List[File]] =
allFiles(dataDir).map { _.filter { _.extension(includeDot = false).exists { e => e.equals(ext) } } }
// allFiles(dataDir).map { _.filter { _.extension(includeDot = false).exists { e => e.equals(ext) } } }
allFiles(dataDir).map { _.filter { isData(_, ext) } }

/**
* Get all the label files
*/
def allLabelFiles(labelDir: File = File(label_dir), ext: String = "json"): Option[List[File]] =
allFiles(labelDir).map { _.filter { _.extension(includeDot = false).exists { e => e.equals(ext) } } }
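// A minimal usage sketch for the two helpers above (an illustrative addition; it
// assumes the default data_dir and label_dir constants point at a local NAB checkout):
//   val dataFiles  = allDataFiles()  // *.csv, "scores" files excluded via isData
//   val labelFiles = allLabelFiles() // *.json
//   dataFiles.foreach { fs => println(s"found ${fs.length} data files") }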

/*
(root/"tmp"/"diary.txt")
.createIfNotExists()
*/

import org.threeten.extra.Interval

import java.time.Instant
@@ -513,8 +514,8 @@ object NABUtils {
* @param reward_low_FP_rate: Double - score when we assign greater weight to the low FPs
* @param reward_low_FN_rate: Double - score when we assign greater weight to the low FNs
* @param standard: Double - score when we assign the same weight to the low FNs and FPs
*
* Note that some files have an extra `raw_score`.
*/
case class NABXtraResultRow(dt: java.time.Instant, value: Double, anomaly_score: Double, raw_score: Double, label: Int, reward_low_FP_rate: Double, reward_low_FN_rate: Double, standard: Double)
case class NABResultRow(dt: java.time.Instant, value: Double, anomaly_score: Double, label: Int, reward_low_FP_rate: Double, reward_low_FN_rate: Double, standard: Double)
@@ -569,7 +570,7 @@ object NABUtils {

/**
* Add a `NABResultRow` to the `NABResultAll` in column-major form. Note that some
* result files have an additional raw score. In this case we simply use an empty
* column as a placeholder.
*/
def addTo(acc: NABResultAll, e: NABResultRow) =
@@ -580,14 +581,14 @@ object NABUtils {

/**
* Add a `NABXtraResultRow` to the `NABResultAll` in column-major form. Note that some
* result files have an additional raw score. In this case we store this data.
*/
def addTo(acc: NABResultAll, e: NABXtraResultRow) =
NABResultAll(e.dt :: acc.dt, e.value :: acc.value, e.anomaly_score :: acc.anomaly_score,
e.raw_score :: acc.raw_score, e.label :: acc.label,
e.reward_low_FP_rate :: acc.reward_low_FP_rate,
e.reward_low_FN_rate :: acc.reward_low_FN_rate, e.standard :: acc.standard)

/**
* When converting the rows to columns we stack the `NABDataRow` data in reverse order.
* This reverses those columns to get the correct order back.
@@ -710,7 +711,7 @@ object NABUtils {
if (!tmp._2.isEmpty) Left(tmp._2.reverse) else Right(NABResultAll(r.dt.reverse, r.value.reverse, r.anomaly_score.reverse, r.raw_score.reverse, r.label.reverse,
r.reward_low_FP_rate.reverse, r.reward_low_FN_rate.reverse, r.standard.reverse))
}

def toNABResultXtraAllColumns(reader: kantan.csv.CsvReader[kantan.csv.ReadResult[NABXtraResultRow]]): Either[List[Throwable], NABResultAll] = {
val z = (emptyNABResultAll, List[Throwable]())
val tmp = reader.foldLeft(z) {
@@ -812,6 +813,8 @@ object NABUtils {
updateHash(t); updateHash(v)
}

import scala.collection.mutable.ArrayBuffer

/**
* This function reads the NAB result file (contains [[NABResultRow]]) and uses the first
* `sample_size` (fraction of the total number of lines, default set to 0.15). It then generates the
@@ -827,14 +830,18 @@
* import kantan.csv.generic._
* }}}
*
* Note: We have to use an ArrayBuffer because Java's Array `equals` only compares
* the object references and not the objects' contents, which prevents arrays from
* being used as keys in collections such as `Map`.
*
* @see updateHash(dt : Instant)
* @see updateHash(value : Double)
* @see https://softwarecave.org/2014/02/26/calculating-cryptographic-hash-functions-in-java/
* @see https://docs.oracle.com/javase/8/docs/api/java/security/MessageDigest.html
* @see https://www.mkyong.com/java/java-sha-hashing-example/
* @see https://github.com/alphazero/Blake2b
*/
def tagData(f: File, sample_size: Double = 0.15)(implicit dt1: RowDecoder[NABResultRow], dt2: RowDecoder[NABXtraResultRow], digest: MessageDigest): Either[List[Throwable], Array[Byte]] = {
def tagData(f: File, sample_size: Double = 0.15)(implicit dt1: RowDecoder[NABResultRow], dt2: RowDecoder[NABXtraResultRow], digest: MessageDigest): Either[List[Throwable], ArrayBuffer[Byte]] = {
val results = loadResults(f)
//implicit val digest = MessageDigest.getInstance("SHA-256")
//println(digest.getAlgorithm)
@@ -846,14 +853,17 @@
val values = x.value.take(sample_len)
val r = dts.zip(values)
r.map { case (t, v) => hashDataPair(t, v) }
Right(digest.digest)
Right( ArrayBuffer(digest.digest: _*) )
}
}

object Hex {
def valueOf(buf: Array[Byte]): String = buf.map("%02X" format _).mkString
def valueOf(buf: ArrayBuffer[Byte]): String = buf.map("%02X" format _).mkString
}
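// Taken together, tagData computes a content fingerprint: hash the first sample_size
// fraction of (time stamp, value) pairs, then wrap the digest in an ArrayBuffer[Byte]
// so it can serve as a Map key. A condensed, self-contained sketch of the same idea
// (the inline encoding below is an illustrative assumption; the exact byte encodings
// used by updateHash are not shown in this hunk):
object FingerprintSketch {
  import java.security.MessageDigest
  import java.time.Instant
  import scala.collection.mutable.ArrayBuffer

  def fingerprint(rows: List[(Instant, Double)], sampleSize: Double = 0.15)(implicit digest: MessageDigest): ArrayBuffer[Byte] = {
    val sample = rows.take(math.max(1, math.ceil(rows.length * sampleSize).toInt))
    sample.foreach { case (t, v) =>
      digest.update(t.toString.getBytes("UTF-8")) // hash the time stamp
      digest.update(v.toString.getBytes("UTF-8")) // hash the value
    }
    ArrayBuffer(digest.digest(): _*) // structural equality, so usable as a Map key
  }
}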

//import collection.JavaConverters._

/**
* This function takes in a list of `better.files.File`s reads the start of these files
* (only the `sample_size` fraction of lines are used) and uses that data to generate a hash
@@ -862,17 +872,23 @@
* perfect score output based on the anomaly window labels.
*
* Note: we truncate the list of parsing errors in `Left[List[Throwable]]`; otherwise, if
* parsing errors occur in many files, the exception data will accumulate and cause an
* out-of-memory error.
*
* Note: We have to use an ArrayBuffer because Java's Array `equals` only compares
* the object references and not the objects' contents, which prevents arrays from
* being used as keys in collections such as `Map`.
*
* @see http://stackoverflow.com/questions/6489584/best-way-to-turn-a-lists-of-eithers-into-an-either-of-lists
*/
def tagFiles(files: List[File], sample_size: Double = 0.15)(implicit dt1: RowDecoder[NABResultRow], dt2: RowDecoder[NABXtraResultRow], digest: MessageDigest): Either[List[Throwable], Map[Array[Byte], String]] = {
def tagFiles(files: List[File], sample_size: Double = 0.15)(implicit dt1: RowDecoder[NABResultRow], dt2: RowDecoder[NABXtraResultRow], digest: MessageDigest): //Either[List[Throwable], Map[Array[Byte], String]] = {
Either[List[Throwable], Map[ArrayBuffer[Byte], String]] = {
val t = files.map { file =>
val keys = tagData(file, sample_size)
// We don't do a simple map; otherwise we would collect all parsing errors
//keys.map(hash => (hash, file.nameWithoutExtension))
keys.fold( {err => Left(err.take(10))}, {hash => Right((hash, file.nameWithoutExtension))})
keys.fold({ err => Left(List(new Throwable(file.name)) ++ err.take(10)) },
{ hash => Right((hash, file.path.toString)) })
}
val t3 = t.partition(_.isLeft)
t3 match {
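The tail of `tagFiles` is collapsed in this view. A sketch of the usual pattern for its final step (per the Stack Overflow link above), assuming the collapsed `t3 match` does the standard partition-and-collect; the name `sequenceTags` is hypothetical:

def sequenceTags[K, V](ts: List[Either[List[Throwable], (K, V)]]): Either[List[Throwable], Map[K, V]] = {
  val (lefts, rights) = ts.partition(_.isLeft)
  if (lefts.nonEmpty) Left(lefts.flatMap(_.left.get)) // accumulate the (truncated) per-file errors
  else Right(rights.map(_.right.get).toMap)           // hash -> file path map
}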
65 changes: 56 additions & 9 deletions src/test/scala/pt/inescn/utils/NABUtilsSpec.scala
@@ -735,17 +735,18 @@ class NABUtilsSpec extends FlatSpec with Matchers {
import kantan.csv.ops._
import kantan.csv.generic._

import scala.collection.mutable.ArrayBuffer
//import java.security.MessageDigest
implicit val digest = MessageDigest.getInstance( "SHA-256" )
val hashf1e : Either[List[Throwable], Array[Byte]] = tagData(labelledp1, sample_size = 0.15 )
val hashf1e : Either[List[Throwable], ArrayBuffer[Byte]] = tagData(labelledp1, sample_size = 0.15 )
hashf1e.isRight should be ( true )
val hashf1 = hashf1e.right.get
//println( Hex.valueOf( hashf1 ) )
val hexf1 = Hex.valueOf( hashf1 )
hexf1.size shouldBe ( hashlen_256 )

digest.reset
val hashf2e : Either[List[Throwable], Array[Byte]] = tagData( labelledp2, sample_size = 0.15 )
val hashf2e : Either[List[Throwable], ArrayBuffer[Byte]] = tagData( labelledp2, sample_size = 0.15 )
//println(hashf2e)
hashf2e.isRight should be ( true )
val hashf2 = hashf2e.right.get
@@ -781,7 +782,8 @@ class NABUtilsSpec extends FlatSpec with Matchers {
// 256 bits / 8 = 32 bytes; 2 hex digits per byte (4 bits each)
val hashlen_256 = (256 / 8) * 2

val hashf1e : Either[List[Throwable], Array[Byte]] = tagData(labelledp1, sample_size = 0.15 )
import scala.collection.mutable.ArrayBuffer
val hashf1e : Either[List[Throwable], ArrayBuffer[Byte]] = tagData(labelledp1, sample_size = 0.15 )
hashf1e.isRight should be ( true )
val hashf1 = hashf1e.right.get
//println( Hex.valueOf( hashf1 ) )
@@ -798,11 +800,9 @@
import NABUtils.NABDataRow._

val data = cwd / "data/nab/results"
// TODO: use File directly
println(data.path.toString)
val dataFiles = allDataFiles(data)
dataFiles shouldBe 'defined
println( dataFiles.mkString(",") )
//println( dataFiles.mkString(",") )

// We need to bring in shapeless "compile time reflection"
// https://nrinaudo.github.io/kantan.csv/tut/shapeless.html
@@ -813,12 +813,59 @@
import java.security.MessageDigest
implicit val digest = MessageDigest.getInstance( "SHA-256" )

// Generate the hashes to find the labels
val fileHash = dataFiles.map{ x =>
println(x.take(1)(0).path.toString)
tagFiles( x.take(1), sample_size = 0.1 )
tagFiles( x, sample_size = 0.1 )
}
println(fileHash)
//println(fileHash)
fileHash shouldBe 'defined
fileHash.get.isRight should be ( true )
fileHash.get.right.get.size should be > 1

// Are ArrayBuffers good as Map keys?
import scala.collection.mutable.ArrayBuffer
val a1 = ArrayBuffer(-62, 103, -57, 6, 121, 31, 15, 1, 8, 57, -111, 116, 31, 70, -105, -123, -54, -110, -61, 28, 114, -118, 50, -38, 92, -36, 86, -79, 51, 30, 44, -7)
val a2 = ArrayBuffer(-62, 103, -57, 6, 121, 31, 15, 1, 8, 57, -111, 116, 31, 70, -105, -123, -54, -110, -61, 28, 114, -118, 50, -38, 92, -36, 86, -79, 51, 30, 44, -7)
//println("a1 == a2 = " + (a1 == a2))
//println("a1 equals a2 = " + (a1.equals(a2)))
a1 shouldBe a2
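// For contrast (illustrative, not part of the original test): plain Java arrays use
// reference equality, so the analogous check with Array[Byte] would fail:
//   val b1 = Array[Byte](1, 2, 3); val b2 = Array[Byte](1, 2, 3)
//   b1 == b2            // false: compares references
//   b1.sameElements(b2) // true:  compares contents
// This is why tagData returns ArrayBuffer[Byte] rather than Array[Byte] keys.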

// NOTE (IMPORTANT): we list and process all result files because they are the only ones with the labels.
// However, we only use the time stamps and values to generate the hash keys. This means that, because the
// same data is used by various algorithms, we will generate the same hash several times over. So the
// Map will not contain all the result files (later files overwrite earlier ones).
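// e.g. (hypothetical paths): results/algoA/foo.csv and results/algoB/foo.csv carry the
// same time stamps and values, so they hash to the same key and only one entry survives.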

// So pick a file at random
val dataMap = fileHash.get.right.get
val x = dataMap.values.toList
val y = dataMap.toList
val testFile = scala.util.Random.shuffle(y).take(1)(0)
//println(labelledp1.toString)
println(" dataMap.values.contains(file) = " + testFile.toString)
val labelledp1 = File(testFile._2)
val w = testFile._1
//println(dataMap)

// Now calculate the hash for that file and see if we can find it
// Make sure we restart the hash generator
digest.reset

// 256 bits / 8 = 32 bytes; 2 hex digits per byte (4 bits each)
val hashlen_256 = (256 / 8) * 2

val hashf1e : Either[List[Throwable], ArrayBuffer[Byte]] = tagData(labelledp1, sample_size = 0.1 )
hashf1e.isRight should be ( true )
val hashf1 = hashf1e.right.get
val hexf1 = Hex.valueOf( hashf1 )
hexf1.size shouldBe ( hashlen_256 )

// Same hash?
Hex.valueOf( w ) shouldBe hexf1
w should be (hashf1)

// In the Map?
dataMap.get( hashf1 ) shouldBe 'defined
dataMap.get( hashf1 ).get shouldBe labelledp1.toString
}

it should "should add the detection correctly" in {
