From f3ee0adc079f396cb2e64e1ed71db0ffc1be8293 Mon Sep 17 00:00:00 2001 From: Sean Chester Date: Sat, 14 Nov 2015 19:36:22 +0100 Subject: [PATCH] Initial release of Generalised Brown source code. --- README.md | 165 +++ cluster_generator/LICENSE.md | 19 + cluster_generator/cluster.py | 70 + merge_generator/CHANGE_LOG.md | 22 + merge_generator/LICENSE.md | 15 + merge_generator/Makefile | 16 + merge_generator/basic/city.cc | 466 +++++++ merge_generator/basic/city.h | 90 ++ merge_generator/basic/hard-ofstream.h | 32 + merge_generator/basic/indent.cc | 3 + merge_generator/basic/indent.h | 18 + merge_generator/basic/lisp.cc | 129 ++ merge_generator/basic/lisp.h | 33 + merge_generator/basic/logging.cc | 145 ++ merge_generator/basic/logging.h | 122 ++ merge_generator/basic/mem-tracker.cc | 53 + merge_generator/basic/mem-tracker.h | 132 ++ merge_generator/basic/mem.h | 14 + merge_generator/basic/multi-ostream.cc | 61 + merge_generator/basic/multi-ostream.h | 67 + merge_generator/basic/opt.cc | 189 +++ merge_generator/basic/opt.h | 100 ++ merge_generator/basic/pipe.h | 46 + merge_generator/basic/prob-utils.cc | 75 + merge_generator/basic/prob-utils.h | 19 + merge_generator/basic/stats.cc | 1 + merge_generator/basic/stats.h | 71 + merge_generator/basic/std.cc | 111 ++ merge_generator/basic/std.h | 110 ++ merge_generator/basic/stl-basic.cc | 1 + merge_generator/basic/stl-basic.h | 113 ++ merge_generator/basic/stl-utils.cc | 1 + merge_generator/basic/stl-utils.h | 232 +++ merge_generator/basic/str-str-db.cc | 35 + merge_generator/basic/str-str-db.h | 19 + merge_generator/basic/str.cc | 91 ++ merge_generator/basic/str.h | 22 + merge_generator/basic/strdb.cc | 209 +++ merge_generator/basic/strdb.h | 101 ++ merge_generator/basic/timer.cc | 11 + merge_generator/basic/timer.h | 35 + merge_generator/basic/union-set.cc | 29 + merge_generator/basic/union-set.h | 22 + merge_generator/cluster-viewer/LICENSE | 22 + merge_generator/cluster-viewer/README.md | 26 + 
.../cluster-viewer/build-viewer.sh | 32 + merge_generator/cluster-viewer/code/final.py | 8 + .../cluster-viewer/code/htmlrows.html | 18 + .../cluster-viewer/code/make_html.py | 75 + merge_generator/cluster-viewer/code/style.css | 9 + .../cluster-viewer/code/template.html | 22 + merge_generator/input.txt | 3 + merge_generator/output.txt | 5 + merge_generator/wcluster.cc | 1238 +++++++++++++++++ 54 files changed, 4773 insertions(+) create mode 100644 README.md create mode 100644 cluster_generator/LICENSE.md create mode 100755 cluster_generator/cluster.py create mode 100644 merge_generator/CHANGE_LOG.md create mode 100644 merge_generator/LICENSE.md create mode 100644 merge_generator/Makefile create mode 100644 merge_generator/basic/city.cc create mode 100644 merge_generator/basic/city.h create mode 100644 merge_generator/basic/hard-ofstream.h create mode 100644 merge_generator/basic/indent.cc create mode 100644 merge_generator/basic/indent.h create mode 100644 merge_generator/basic/lisp.cc create mode 100644 merge_generator/basic/lisp.h create mode 100644 merge_generator/basic/logging.cc create mode 100644 merge_generator/basic/logging.h create mode 100644 merge_generator/basic/mem-tracker.cc create mode 100644 merge_generator/basic/mem-tracker.h create mode 100644 merge_generator/basic/mem.h create mode 100644 merge_generator/basic/multi-ostream.cc create mode 100644 merge_generator/basic/multi-ostream.h create mode 100644 merge_generator/basic/opt.cc create mode 100644 merge_generator/basic/opt.h create mode 100644 merge_generator/basic/pipe.h create mode 100644 merge_generator/basic/prob-utils.cc create mode 100644 merge_generator/basic/prob-utils.h create mode 100644 merge_generator/basic/stats.cc create mode 100644 merge_generator/basic/stats.h create mode 100644 merge_generator/basic/std.cc create mode 100644 merge_generator/basic/std.h create mode 100644 merge_generator/basic/stl-basic.cc create mode 100644 merge_generator/basic/stl-basic.h create mode 100644 
merge_generator/basic/stl-utils.cc create mode 100644 merge_generator/basic/stl-utils.h create mode 100644 merge_generator/basic/str-str-db.cc create mode 100644 merge_generator/basic/str-str-db.h create mode 100644 merge_generator/basic/str.cc create mode 100644 merge_generator/basic/str.h create mode 100644 merge_generator/basic/strdb.cc create mode 100644 merge_generator/basic/strdb.h create mode 100644 merge_generator/basic/timer.cc create mode 100644 merge_generator/basic/timer.h create mode 100644 merge_generator/basic/union-set.cc create mode 100644 merge_generator/basic/union-set.h create mode 100644 merge_generator/cluster-viewer/LICENSE create mode 100644 merge_generator/cluster-viewer/README.md create mode 100755 merge_generator/cluster-viewer/build-viewer.sh create mode 100644 merge_generator/cluster-viewer/code/final.py create mode 100644 merge_generator/cluster-viewer/code/htmlrows.html create mode 100644 merge_generator/cluster-viewer/code/make_html.py create mode 100644 merge_generator/cluster-viewer/code/style.css create mode 100644 merge_generator/cluster-viewer/code/template.html create mode 100644 merge_generator/input.txt create mode 100644 merge_generator/output.txt create mode 100755 merge_generator/wcluster.cc diff --git a/README.md b/README.md new file mode 100644 index 0000000..8b3fa87 --- /dev/null +++ b/README.md @@ -0,0 +1,165 @@ +## generalised-brown +version 1.0 +© 2015 Sean Chester and Leon Derczynski + +------------------------------------------- +### Table of Contents + + * [Introduction](#introduction) + * [Requirements](#requirements) + * [Installation](#installation) + * [Usage](#usage) + * [License](#license) + * [Contact](#contact) + + +------------------------------------ +### Introduction + + +The *generalised-brown* software suite clusters word types by +distributional similarity in two phases. 
It first generates a list +of merges based on the well-known Brown clustering algorithm and +then recalls historical states to vary the granularity of the +clusters. For example, given the following corpus: + +> Alice likes dogs and Bob likes cats while Alice hates snakes and Bob hates spiders + +Greedily clustering word types based on *average mutual information* +(i.e., running the *C++ merge generator*) produces the following +merge list (assuming _a_ = _|V|_ = 10): + +> snakes spiders 8 +> dogs cats 7 +> Alice Bob 6 +> and while 5 +> likes hates 4 +> dogs snakes 3 +> dogs and 2 +> dogs Alice 1 +> dogs likes 0 + +One can then recall any historical state of the computation in order to +produce a set of clusters (i.e., run the *python cluster generator*). +For example, with _c_ = 5, we recall the state _c_ - 1 = 4 to produce +the following clusters: + +> {snakes, spiders} +> {dogs, cats} +> {Alice, Bob} +> {likes, hates} +> {and, while} + +This approach (setting separate values of _a_ and _c_) we refer to as +*Roll-up feature generation*. By contrast, traditional Brown clustering +would produce the following five clusters (equivalent to running the +*C++ merge generator* with _a_ = 5 **and** the *python cluster generator* +with _c_ = 5): + +> {likes, hates} +> {snakes, spiders, cats, dogs} +> {and, while} +> {Alice} +> {Bob} + +For details about the concepts implemented in this software, please +read our recent AAAI paper: + +> L. Derczynski and S. Chester. 2016. "Generalised Brown Clustering +> and Roll-up Feature Generation." In: Proceedings of the +> Thirtieth AAAI Conference on Artificial Intelligence (AAAI-16). +> 7 pages. To appear. + +For details about traditional Brown clustering, consult the article +in which it was introduced: + +> PF Brown et al. 1992. "Class-based n-gram models of natural language." +> Computational Linguistics 18(4): 467--479. 
+ +or the implementation that our *C++ merge generator* forked: + +> [wcluster](https://github.com/percyliang/brown-cluster). + + +------------------------------------ +### Requirements + + +*generalised-brown* relies on the following applications: + + + For compiling the *C++ merge generator*: A C++ compiler that + is compatible with C++ 11 and OpenMP (e.g., the newest + [GNU compiler](https://gcc.gnu.org/)) and the *make* program + + + For running the *python cluster generator*: A *python* + interpreter + +------------------------------------ +### Installation + + +The *python cluster generator* does not need to be compiled. +To compile the *C++ merge generator*, navigate to the +*merge_generator/* subdirectory of the project and type: + +>make + +------------------------------------ +### Usage + + +To produce a set of features for a corpus, you will first want to use +Generalised Brown (i.e., the *C++ merge generator*) to create a merge list. +Then, you can create c clusters by running the *python cluster generator* +on the merge list. This second step can be done for as many values of _c_ +as you like, but we recommend that each value of _c_ is not larger than the +value of _a_ used to generate the merge list. + +To run the *C++ merge generator*, type: + +>./merge_generator/wcluster --text [input_file] --a [active_set_size] + +The resultant merges will be recorded in: + +>./[input_file]-c[active_set_size]-p1.out/merges + +To run the *python cluster generator*, type: + +>python ./cluster_generator/cluster.py -in ./[input_file]-c[active_set_size]-p1.out/merges -c 3 + +Each word type will be printed to *stdout* with its cluster id. + +The *C++ merge generator* runs in _O(|V| a^2)_ time, where _|V|_ is the number +of distinct word types in the corpus (i.e., the size of the vocabulary) and +_a_ is a bound on the algorithm's search space. The *python cluster generator* +runs in _O(|V|)_ time. 
+ + +------------------------------------ +### License + + +This software consists of two sub-modules, each released under a +different license: + + + The *python cluster generator* is subject to the terms of +[The MIT License](http://opensource.org/licenses/MIT) + + + The *C++ merge generator* follows the original licensing terms +of [wcluster](https://github.com/percyliang/brown-cluster). + +See the relevant sub-directories of this repository for the +specific details of each license. + + + +------------------------------------ +### Contact + + +This software suite will undergo a major revision; so, you are encouraged +to ensure that this is still the latest version. Please do not hesitate to +contact the authors if you have comments, questions, or bugs to report. +>[generalised-brown on GitHub](https://github.com/sean-chester/generalised-brown) + +------------------------------------ \ No newline at end of file diff --git a/cluster_generator/LICENSE.md b/cluster_generator/LICENSE.md new file mode 100644 index 0000000..fe67eb2 --- /dev/null +++ b/cluster_generator/LICENSE.md @@ -0,0 +1,19 @@ +Copyright (c) 2015 Sean Chester and Leon Derczynski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/cluster_generator/cluster.py b/cluster_generator/cluster.py new file mode 100755 index 0000000..0c034e7 --- /dev/null +++ b/cluster_generator/cluster.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +# cluster.py +# © Sean Chester (sean.chester@idi.ntnu.no) +# 22 July 2015 + +import csv +import argparse + +# Input parsing +parser = argparse.ArgumentParser( + description='Prints out a tree with a specified number of leaves, given an ' + \ + 'input file with an ordered list of merges. Each unique path identifies ' + \ + 'one leaf. All word types that have the same path as each other belong to the ' + \ + 'same leaf (and correspond to one Brown cluster).', \ + epilog='If the output is to be read by humans, consider piping results to ' + \ + 'the sort command to print the leaves in depth-first order. (Then ' + \ + 'similar leaves/clusters will appear nearer each other in the output.)') +parser.add_argument( + '-in', '--input-file', \ + help="Input file containing ordered merges", \ + required=True, \ + dest='input', \ + metavar='INPUT_FILE') +parser.add_argument( + '-c', '--num-classes', \ + type=int, \ + help="Number of leaves/classes/clusters to produce", \ + required=True, \ + dest='leaves', \ + metavar='NUM_CLASSES') +parser.add_argument( + '-d', '--depth', \ + type=int, \ + help="Truncation depth for paths (i.e., no leaf appears farther than d-1 hops from the " + \ + "root). 
Note: setting this parametre likely results in fewer than NUM_CLASSES leaves, " + \ + "because the --num-classes filter is (logically) applied first.", \ + required=False, \ + dest='depth') +args = parser.parse_args() + +# If depth wasn't passed as a parametre, give it a default value of being +# equal to --num-classes. +if args.depth is None: + args.depth = args.leaves + +# Actual processing -- read merge list in reverse and map each encountered +# word type onto a tree path in a dictionary. +tree = {} +with open( args.input ) as tsv: + for line in reversed(list(csv.reader(tsv, delimiter="\t", quotechar=None))): + merge_into = line[0] + merge_from = line[1] + if not tree.has_key(merge_into): + tree[merge_into] = "0" + tree[merge_from] = "1" + args.leaves = args.leaves - 2 + elif args.leaves > 0: + parent = tree[merge_into] + if len( parent ) < args.depth: + tree[merge_from] = parent + "1" + tree[merge_into] = parent + "0" + else: + tree[merge_from] = parent + args.leaves = args.leaves - 1 + else: + tree[merge_from] = tree[merge_into] + +for (cluster, path) in tree.items(): + print( path + "\t" + cluster ) + diff --git a/merge_generator/CHANGE_LOG.md b/merge_generator/CHANGE_LOG.md new file mode 100644 index 0000000..1fe094a --- /dev/null +++ b/merge_generator/CHANGE_LOG.md @@ -0,0 +1,22 @@ +# Change Log + +-------------------- + +## 1.3.1: [Sean Chester](https://github.com/sean-chester) + + Added conceptual generalisation whereby every merge is logged so that + historical states can be recalled with ../cluster_generator/cluster.py. + + Added more parallelism (courtesy of + [Kenneth S Bøgh](https://dk.linkedin.com/in/kenneth-sejdenfaden-bøgh-58915524)). + + Aliased the input parametre _c_ as _a_ to fit the conceptual generalisation + (while maintaining backwards compatibility). + +## 1.3: [Percy Liang](https://github.com/percyliang) + + compatibility updates for newer versions of g++ (courtesy of Chris Dyer). 
+ +## 1.2: [Percy Liang](https://github.com/percyliang) + + make compatible with MacOS (replaced timespec with timeval and changed order of linking). + +## 1.1: [Percy Liang](https://github.com/percyliang) + + Removed deprecated operators so it works with GCC 4.3. + +-------------------- \ No newline at end of file diff --git a/merge_generator/LICENSE.md b/merge_generator/LICENSE.md new file mode 100644 index 0000000..d3aaef3 --- /dev/null +++ b/merge_generator/LICENSE.md @@ -0,0 +1,15 @@ +(C) Copyright 2015 [Sean Chester](https://github.com/sean-chester) +and [Leon Derczynski](http://derczynski.com/) +(C) Copyright 2007-2012, Percy Liang + +http://cs.stanford.edu/~pliang + +Permission is granted for anyone to copy, use, or modify these programs and +accompanying documents for purposes of research or education, provided this +copyright notice is retained, and note is made of any changes that have been +made. + +These programs and documents are distributed without any warranty, express or +implied. As the programs were written for research purposes only, they have +not been tested to the degree that would be advisable in any important +application. All use of these programs is entirely at the user's own risk. 
\ No newline at end of file diff --git a/merge_generator/Makefile b/merge_generator/Makefile new file mode 100644 index 0000000..95b6c38 --- /dev/null +++ b/merge_generator/Makefile @@ -0,0 +1,16 @@ +# 1.2: need to make sure opt.o goes in the right order to get the right scope on the command-line arguments +# Use this for Linux +ifeq ($(shell uname),Linux) + files=$(subst .cc,.o,basic/logging.cc $(shell /bin/ls *.cc) $(shell /bin/ls basic/*.cc | grep -v logging.cc)) +else + files=$(subst .cc,.o,basic/opt.cc $(shell /bin/ls *.cc) $(shell /bin/ls basic/*.cc | grep -v opt.cc)) +endif + +wcluster: $(files) + g++ -Wall -g -std=c++0x -O3 -fopenmp -o wcluster $(files) -lpthread + +%.o: %.cc + g++ -Wall -g -O3 -fopenmp -std=c++0x -o $@ -c $< + +clean: + rm -f wcluster basic/*.o *.o diff --git a/merge_generator/basic/city.cc b/merge_generator/basic/city.cc new file mode 100644 index 0000000..5fc8aa3 --- /dev/null +++ b/merge_generator/basic/city.cc @@ -0,0 +1,466 @@ +// Copyright (c) 2011 Google, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +// CityHash, by Geoff Pike and Jyrki Alakuijala +// +// This file provides CityHash64() and related functions. +// +// It's probably possible to create even faster hash functions by +// writing a program that systematically explores some of the space of +// possible hash functions, by using SIMD instructions, or by +// compromising on hash quality. + +#include "city.h" + +#include <algorithm> +#include <string.h> // for memcpy and memset + +using namespace std; + +static uint64 UNALIGNED_LOAD64(const char *p) { + uint64 result; + memcpy(&result, p, sizeof(result)); + return result; +} + +static uint32 UNALIGNED_LOAD32(const char *p) { + uint32 result; + memcpy(&result, p, sizeof(result)); + return result; +} + +#if !defined(WORDS_BIGENDIAN) + +#define uint32_in_expected_order(x) (x) +#define uint64_in_expected_order(x) (x) + +#else + +#ifdef _MSC_VER +#include <stdlib.h> +#define bswap_32(x) _byteswap_ulong(x) +#define bswap_64(x) _byteswap_uint64(x) + +#elif defined(__APPLE__) +// Mac OS X / Darwin features +#include <libkern/OSByteOrder.h> +#define bswap_32(x) OSSwapInt32(x) +#define bswap_64(x) OSSwapInt64(x) + +#else +#include <byteswap.h> +#endif + +#define uint32_in_expected_order(x) (bswap_32(x)) +#define uint64_in_expected_order(x) (bswap_64(x)) + +#endif // WORDS_BIGENDIAN + +#if !defined(LIKELY) +#if HAVE_BUILTIN_EXPECT +#define LIKELY(x) (__builtin_expect(!!(x), 1)) +#else +#define LIKELY(x) (x) +#endif +#endif + +static uint64 Fetch64(const char *p) { + return uint64_in_expected_order(UNALIGNED_LOAD64(p)); +} + +static uint32 Fetch32(const char *p) { + return uint32_in_expected_order(UNALIGNED_LOAD32(p)); +} + +// Some primes between 2^63 and 2^64 for various uses. 
+static const uint64 k0 = 0xc3a5c85c97cb3127ULL; +static const uint64 k1 = 0xb492b66fbe98f273ULL; +static const uint64 k2 = 0x9ae16a3b2f90404fULL; +static const uint64 k3 = 0xc949d7c7509e6557ULL; + +// Bitwise right rotate. Normally this will compile to a single +// instruction, especially if the shift is a manifest constant. +static uint64 Rotate(uint64 val, int shift) { + // Avoid shifting by 64: doing so yields an undefined result. + return shift == 0 ? val : ((val >> shift) | (val << (64 - shift))); +} + +// Equivalent to Rotate(), but requires the second arg to be non-zero. +// On x86-64, and probably others, it's possible for this to compile +// to a single instruction if both args are already in registers. +static uint64 RotateByAtLeast1(uint64 val, int shift) { + return (val >> shift) | (val << (64 - shift)); +} + +static uint64 ShiftMix(uint64 val) { + return val ^ (val >> 47); +} + +static uint64 HashLen16(uint64 u, uint64 v) { + return Hash128to64(uint128(u, v)); +} + +static uint64 HashLen0to16(const char *s, size_t len) { + if (len > 8) { + uint64 a = Fetch64(s); + uint64 b = Fetch64(s + len - 8); + return HashLen16(a, RotateByAtLeast1(b + len, len)) ^ b; + } + if (len >= 4) { + uint64 a = Fetch32(s); + return HashLen16(len + (a << 3), Fetch32(s + len - 4)); + } + if (len > 0) { + uint8 a = s[0]; + uint8 b = s[len >> 1]; + uint8 c = s[len - 1]; + uint32 y = static_cast(a) + (static_cast(b) << 8); + uint32 z = len + (static_cast(c) << 2); + return ShiftMix(y * k2 ^ z * k3) * k2; + } + return k2; +} + +// This probably works well for 16-byte strings as well, but it may be overkill +// in that case. +static uint64 HashLen17to32(const char *s, size_t len) { + uint64 a = Fetch64(s) * k1; + uint64 b = Fetch64(s + 8); + uint64 c = Fetch64(s + len - 8) * k2; + uint64 d = Fetch64(s + len - 16) * k0; + return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d, + a + Rotate(b ^ k3, 20) - c + len); +} + +// Return a 16-byte hash for 48 bytes. Quick and dirty. 
+// Callers do best to use "random-looking" values for a and b. +static pair<uint64, uint64> WeakHashLen32WithSeeds( + uint64 w, uint64 x, uint64 y, uint64 z, uint64 a, uint64 b) { + a += w; + b = Rotate(b + a + z, 21); + uint64 c = a; + a += x; + a += y; + b += Rotate(a, 44); + return make_pair(a + z, b + c); +} + +// Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty. +static pair<uint64, uint64> WeakHashLen32WithSeeds( + const char* s, uint64 a, uint64 b) { + return WeakHashLen32WithSeeds(Fetch64(s), + Fetch64(s + 8), + Fetch64(s + 16), + Fetch64(s + 24), + a, + b); +} + +// Return an 8-byte hash for 33 to 64 bytes. +static uint64 HashLen33to64(const char *s, size_t len) { + uint64 z = Fetch64(s + 24); + uint64 a = Fetch64(s) + (len + Fetch64(s + len - 16)) * k0; + uint64 b = Rotate(a + z, 52); + uint64 c = Rotate(a, 37); + a += Fetch64(s + 8); + c += Rotate(a, 7); + a += Fetch64(s + 16); + uint64 vf = a + z; + uint64 vs = b + Rotate(a, 31) + c; + a = Fetch64(s + 16) + Fetch64(s + len - 32); + z = Fetch64(s + len - 8); + b = Rotate(a + z, 52); + c = Rotate(a, 37); + a += Fetch64(s + len - 24); + c += Rotate(a, 7); + a += Fetch64(s + len - 16); + uint64 wf = a + z; + uint64 ws = b + Rotate(a, 31) + c; + uint64 r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0); + return ShiftMix(r * k0 + vs) * k2; +} + +uint64 CityHash64(const char *s, size_t len) { + if (len <= 32) { + if (len <= 16) { + return HashLen0to16(s, len); + } else { + return HashLen17to32(s, len); + } + } else if (len <= 64) { + return HashLen33to64(s, len); + } + + // For strings over 64 bytes we hash the end first, and then as we + // loop we keep 56 bytes of state: v, w, x, y, and z. 
+ uint64 x = Fetch64(s + len - 40); + uint64 y = Fetch64(s + len - 16) + Fetch64(s + len - 56); + uint64 z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24)); + pair<uint64, uint64> v = WeakHashLen32WithSeeds(s + len - 64, len, z); + pair<uint64, uint64> w = WeakHashLen32WithSeeds(s + len - 32, y + k1, x); + x = x * k1 + Fetch64(s); + + // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks. + len = (len - 1) & ~static_cast<size_t>(63); + do { + x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1; + y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; + x ^= w.second; + y += v.first + Fetch64(s + 40); + z = Rotate(z + w.first, 33) * k1; + v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); + std::swap(z, x); + s += 64; + len -= 64; + } while (len != 0); + return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z, + HashLen16(v.second, w.second) + x); +} + +uint64 CityHash64WithSeed(const char *s, size_t len, uint64 seed) { + return CityHash64WithSeeds(s, len, k2, seed); +} + +uint64 CityHash64WithSeeds(const char *s, size_t len, + uint64 seed0, uint64 seed1) { + return HashLen16(CityHash64(s, len) - seed0, seed1); +} + +// A subroutine for CityHash128(). Returns a decent 128-bit hash for strings +// of any length representable in signed long. Based on City and Murmur. +static uint128 CityMurmur(const char *s, size_t len, uint128 seed) { + uint64 a = Uint128Low64(seed); + uint64 b = Uint128High64(seed); + uint64 c = 0; + uint64 d = 0; + signed long l = len - 16; + if (l <= 0) { // len <= 16 + a = ShiftMix(a * k1) * k1; + c = b * k1 + HashLen0to16(s, len); + d = ShiftMix(a + (len >= 8 ? 
Fetch64(s) : c)); + } else { // len > 16 + c = HashLen16(Fetch64(s + len - 8) + k1, a); + d = HashLen16(b + len, c + Fetch64(s + len - 16)); + a += d; + do { + a ^= ShiftMix(Fetch64(s) * k1) * k1; + a *= k1; + b ^= a; + c ^= ShiftMix(Fetch64(s + 8) * k1) * k1; + c *= k1; + d ^= c; + s += 16; + l -= 16; + } while (l > 0); + } + a = HashLen16(a, c); + b = HashLen16(d, b); + return uint128(a ^ b, HashLen16(b, a)); +} + +uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed) { + if (len < 128) { + return CityMurmur(s, len, seed); + } + + // We expect len >= 128 to be the common case. Keep 56 bytes of state: + // v, w, x, y, and z. + pair<uint64, uint64> v, w; + uint64 x = Uint128Low64(seed); + uint64 y = Uint128High64(seed); + uint64 z = len * k1; + v.first = Rotate(y ^ k1, 49) * k1 + Fetch64(s); + v.second = Rotate(v.first, 42) * k1 + Fetch64(s + 8); + w.first = Rotate(y + z, 35) * k1 + x; + w.second = Rotate(x + Fetch64(s + 88), 53) * k1; + + // This is the same inner loop as CityHash64(), manually unrolled. + do { + x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1; + y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; + x ^= w.second; + y += v.first + Fetch64(s + 40); + z = Rotate(z + w.first, 33) * k1; + v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); + std::swap(z, x); + s += 64; + x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1; + y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; + x ^= w.second; + y += v.first + Fetch64(s + 40); + z = Rotate(z + w.first, 33) * k1; + v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); + std::swap(z, x); + s += 64; + len -= 128; + } while (LIKELY(len >= 128)); + x += Rotate(v.first + z, 49) * k0; + z += Rotate(w.first, 37) * k0; + // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s. 
+ for (size_t tail_done = 0; tail_done < len; ) { + tail_done += 32; + y = Rotate(x + y, 42) * k0 + v.second; + w.first += Fetch64(s + len - tail_done + 16); + x = x * k0 + w.first; + z += w.second + Fetch64(s + len - tail_done); + w.second += v.first; + v = WeakHashLen32WithSeeds(s + len - tail_done, v.first + z, v.second); + } + // At this point our 56 bytes of state should contain more than + // enough information for a strong 128-bit hash. We use two + // different 56-byte-to-8-byte hashes to get a 16-byte final result. + x = HashLen16(x, v.first); + y = HashLen16(y + z, w.first); + return uint128(HashLen16(x + v.second, w.second) + y, + HashLen16(x + w.second, y + v.second)); +} + +uint128 CityHash128(const char *s, size_t len) { + if (len >= 16) { + return CityHash128WithSeed(s + 16, + len - 16, + uint128(Fetch64(s) ^ k3, + Fetch64(s + 8))); + } else if (len >= 8) { + return CityHash128WithSeed(NULL, + 0, + uint128(Fetch64(s) ^ (len * k0), + Fetch64(s + len - 8) ^ k1)); + } else { + return CityHash128WithSeed(s, len, uint128(k0, k1)); + } +} + +#ifdef __SSE4_2__ +#include <citycrc.h> +#include <nmmintrin.h> + +// Requires len >= 240. +static void CityHashCrc256Long(const char *s, size_t len, + uint32 seed, uint64 *result) { + uint64 a = Fetch64(s + 56) + k0; + uint64 b = Fetch64(s + 96) + k0; + uint64 c = result[0] = HashLen16(b, len); + uint64 d = result[1] = Fetch64(s + 120) * k0 + len; + uint64 e = Fetch64(s + 184) + seed; + uint64 f = seed; + uint64 g = 0; + uint64 h = 0; + uint64 i = 0; + uint64 j = 0; + uint64 t = c + d; + + // 240 bytes of input per iter. 
+ size_t iters = len / 240; + len -= iters * 240; + do { +#define CHUNK(multiplier, z) \ + { \ + uint64 old_a = a; \ + a = Rotate(b, 41 ^ z) * multiplier + Fetch64(s); \ + b = Rotate(c, 27 ^ z) * multiplier + Fetch64(s + 8); \ + c = Rotate(d, 41 ^ z) * multiplier + Fetch64(s + 16); \ + d = Rotate(e, 33 ^ z) * multiplier + Fetch64(s + 24); \ + e = Rotate(t, 25 ^ z) * multiplier + Fetch64(s + 32); \ + t = old_a; \ + } \ + f = _mm_crc32_u64(f, a); \ + g = _mm_crc32_u64(g, b); \ + h = _mm_crc32_u64(h, c); \ + i = _mm_crc32_u64(i, d); \ + j = _mm_crc32_u64(j, e); \ + s += 40 + + CHUNK(1, 1); CHUNK(k0, 0); + CHUNK(1, 1); CHUNK(k0, 0); + CHUNK(1, 1); CHUNK(k0, 0); + } while (--iters > 0); + + while (len >= 40) { + CHUNK(k0, 0); + len -= 40; + } + if (len > 0) { + s = s + len - 40; + CHUNK(k0, 0); + } + j += i << 32; + a = HashLen16(a, j); + h += g << 32; + b += h; + c = HashLen16(c, f) + i; + d = HashLen16(d, e + result[0]); + j += e; + i += HashLen16(h, t); + e = HashLen16(a, d) + j; + f = HashLen16(b, c) + a; + g = HashLen16(j, i) + c; + result[0] = e + f + g + h; + a = ShiftMix((a + g) * k0) * k0 + b; + result[1] += a + result[0]; + a = ShiftMix(a * k0) * k0 + c; + result[2] = a + result[1]; + a = ShiftMix((a + e) * k0) * k0; + result[3] = a + result[2]; +} + +// Requires len < 240. 
+static void CityHashCrc256Short(const char *s, size_t len, uint64 *result) { + char buf[240]; + memcpy(buf, s, len); + memset(buf + len, 0, 240 - len); + CityHashCrc256Long(buf, 240, ~static_cast<uint32>(len), result); +} + +void CityHashCrc256(const char *s, size_t len, uint64 *result) { + if (LIKELY(len >= 240)) { + CityHashCrc256Long(s, len, 0, result); + } else { + CityHashCrc256Short(s, len, result); + } +} + +uint128 CityHashCrc128WithSeed(const char *s, size_t len, uint128 seed) { + if (len <= 900) { + return CityHash128WithSeed(s, len, seed); + } else { + uint64 result[4]; + CityHashCrc256(s, len, result); + uint64 u = Uint128High64(seed) + result[0]; + uint64 v = Uint128Low64(seed) + result[1]; + return uint128(HashLen16(u, v + result[2]), + HashLen16(Rotate(v, 32), u * k0 + result[3])); + } +} + +uint128 CityHashCrc128(const char *s, size_t len) { + if (len <= 900) { + return CityHash128(s, len); + } else { + uint64 result[4]; + CityHashCrc256(s, len, result); + return uint128(result[2], result[3]); + } +} + +#endif diff --git a/merge_generator/basic/city.h b/merge_generator/basic/city.h new file mode 100644 index 0000000..c2ab352 --- /dev/null +++ b/merge_generator/basic/city.h @@ -0,0 +1,90 @@ +// Copyright (c) 2011 Google, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+// CityHash, by Geoff Pike and Jyrki Alakuijala
+//
+// This file provides a few functions for hashing strings. On x86-64
+// hardware in 2011, CityHash64() is faster than other high-quality
+// hash functions, such as Murmur. This is largely due to higher
+// instruction-level parallelism. CityHash64() and CityHash128() also perform
+// well on hash-quality tests.
+//
+// CityHash128() is optimized for relatively long strings and returns
+// a 128-bit hash. For strings more than about 2000 bytes it can be
+// faster than CityHash64().
+//
+// Functions in the CityHash family are not suitable for cryptography.
+//
+// WARNING: This code has not been tested on big-endian platforms!
+// It is known to work well on little-endian platforms that have a small penalty
+// for unaligned reads, such as current Intel and AMD moderate-to-high-end CPUs.
+//
+// By the way, for some hash functions, given strings a and b, the hash
+// of a+b is easily derived from the hashes of a and b. This property
+// doesn't hold for any hash functions in this file.
+
+#ifndef CITY_HASH_H_
+#define CITY_HASH_H_
+
+#include <stdlib.h>  // for size_t.
+#include <stdint.h>
+#include <utility>
+
+// Short aliases for the fixed-width integer types used by the hash functions.
+typedef uint8_t uint8;
+typedef uint32_t uint32;
+typedef uint64_t uint64;
+// A 128-bit value held as two 64-bit words: first is the low word and second
+// is the high word (see the accessors below).
+typedef std::pair<uint64, uint64> uint128;
+
+// Returns the low 64 bits (the pair's first member).
+inline uint64 Uint128Low64(const uint128& x) { return x.first; }
+// Returns the high 64 bits (the pair's second member).
+inline uint64 Uint128High64(const uint128& x) { return x.second; }
+
+// Hash function for a byte array.
+uint64 CityHash64(const char *buf, size_t len); + +// Hash function for a byte array. For convenience, a 64-bit seed is also +// hashed into the result. +uint64 CityHash64WithSeed(const char *buf, size_t len, uint64 seed); + +// Hash function for a byte array. For convenience, two seeds are also +// hashed into the result. +uint64 CityHash64WithSeeds(const char *buf, size_t len, + uint64 seed0, uint64 seed1); + +// Hash function for a byte array. +uint128 CityHash128(const char *s, size_t len); + +// Hash function for a byte array. For convenience, a 128-bit seed is also +// hashed into the result. +uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed); + +// Hash 128 input bits down to 64 bits of output. +// This is intended to be a reasonably good hash function. +inline uint64 Hash128to64(const uint128& x) { + // Murmur-inspired hashing. + const uint64 kMul = 0x9ddfea08eb382d69ULL; + uint64 a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; + a ^= (a >> 47); + uint64 b = (Uint128High64(x) ^ a) * kMul; + b ^= (b >> 47); + b *= kMul; + return b; +} + +#endif // CITY_HASH_H_ diff --git a/merge_generator/basic/hard-ofstream.h b/merge_generator/basic/hard-ofstream.h new file mode 100644 index 0000000..f74e62d --- /dev/null +++ b/merge_generator/basic/hard-ofstream.h @@ -0,0 +1,32 @@ +#ifndef __HARD_OFSTREAM_H__ +#define __HARD_OFSTREAM_H__ + +// On AFS, flushing a file writes it to the local disk but not AFS. +// Hard flushing ensures that the file will be written, by closing +// and re-opening the file. 
+
+#include <fstream>
+#include <string>
+
+using namespace std;
+
+// An ofstream that remembers the path it was opened with, so that
+// hard_flush() can close the file and re-open it in append mode.
+class hard_ofstream : public ofstream {
+public:
+  hard_ofstream() { }
+  hard_ofstream(const char *file, ofstream::openmode mode = ofstream::trunc) { open(file, mode); }
+
+  // Opens `file` and records its path for later hard_flush() calls.
+  void open(const char *file, ofstream::openmode mode = ofstream::trunc) {
+    ofstream::open(file, mode);
+    this->file = file;
+  }
+
+  // Closes the file and re-opens the recorded path in append mode.
+  void hard_flush() {
+    close();
+    // Re-open through the base class: the recorded path is already correct,
+    // and this avoids assigning `file` from a pointer into its own buffer.
+    ofstream::open(file.c_str(), ofstream::app);
+  }
+
+private:
+  string file;  // Path passed to the most recent open().
+};
+
+#endif
diff --git a/merge_generator/basic/indent.cc b/merge_generator/basic/indent.cc
new file mode 100644
index 0000000..f462700
--- /dev/null
+++ b/merge_generator/basic/indent.cc
@@ -0,0 +1,3 @@
+#include "indent.h"
+
+#include "opt.h"
diff --git a/merge_generator/basic/indent.h b/merge_generator/basic/indent.h
new file mode 100644
index 0000000..54fa0ca
--- /dev/null
+++ b/merge_generator/basic/indent.h
@@ -0,0 +1,18 @@
+#ifndef __INDENT_H__
+#define __INDENT_H__
+
+#include <iostream>
+
+using namespace std;
+
+// Stream manipulator: writing an Indent to an ostream emits one indentation
+// unit per level.
+struct Indent {
+  Indent(int level) : level(level) { }
+  int level;
+};
+
+inline ostream &operator<<(ostream &out, const Indent &ind) {
+  for(int i = 0; i < ind.level; i++) out << " ";
+  return out;
+}
+
+#endif
diff --git a/merge_generator/basic/lisp.cc b/merge_generator/basic/lisp.cc
new file mode 100644
index 0000000..36d2e16
--- /dev/null
+++ b/merge_generator/basic/lisp.cc
@@ -0,0 +1,129 @@
+#include "lisp.h"
+#include "std.h"
+#include "indent.h"
+
+// Recursively destroys and deletes every child of this node.  The node
+// itself is not deleted; that is left to the caller.
+void LispNode::destroy() {
+  forvec(_, LispNode *, node, children) {
+    node->destroy();
+    delete node;
+  }
+  children.clear();  // Do not keep pointers to deleted nodes around.
+}
+
+// Prints this node's value (or "(empty)") at indent level `ind`, followed by
+// its children one level deeper.
+void LispNode::print(int ind) const {
+  cout << Indent(ind) << (value.empty() ? "(empty)" : value) << endl;
+  forvec(_, LispNode *, subnode, children)
+    subnode->print(ind+1);
+}
+
+////////////////////////////////////////////////////////////
+
+LispTree::~LispTree() {
+  // root stays NULL until read() succeeds, so guard against an empty tree.
+  if(root) {
+    root->destroy();
+    delete root;
+  }
+}
+
+bool is_paren(char c) {
+  return c == '(' || c == ')' || c == '[' || c == ']';
+}
+bool is_paren(string s) {
+  return s == "(" || s == ")" || s == "[" || s == "]";
+}
+bool is_left_paren(string s) {
+  return s == "(" || s == "[";
+}
+bool is_right_paren(string s) {
+  return s == ")" || s == "]";
+}
+// Returns the closing token that matches an opening paren, or "" if c is
+// not an opening paren.
+string matching_right_paren(char c) {
+  if(c == '(') return ")";
+  if(c == '[') return "]";
+  return "";
+}
+
+// Return first non-space character (without consuming it).
+// At end of input the value of EOF, converted to char, is returned.
+char skip_space(istream &in) {
+  while(true) {
+    // Keep the result of peek() as an int: it is either EOF or a value that
+    // is safe to hand to isspace().
+    int c = in.peek();
+    if(c == EOF || !isspace(c)) return (char)c;
+    in.get();
+  }
+}
+
+// Comments start with # and end with the line.
+// There must be a space before the #.
+// Returns the first character that is neither white space nor part of a
+// comment (without consuming it).
+char skip_comments(istream &in) {
+  while(true) {
+    char c = skip_space(in);
+    if(c != '#') return c;
+    // Discard the rest of the comment.  Also stop at end of input, otherwise
+    // a comment on an unterminated last line would never finish.
+    int ch;
+    while((ch = in.peek()) != EOF && ch != '\n') in.get();
+  }
+}
+
+// Reads the next token into s: either a single paren character or a maximal
+// run of characters that are neither white space nor parens.
+// Returns false when the input is exhausted and no token was read.
+bool LispTree::read_token(istream &in, string &s) {
+  char c = skip_comments(in);
+
+  if(is_paren(c)) {
+    s = (char)in.get();
+    return true;
+  }
+
+  s = "";
+  while(true) {
+    int ch = in.peek();
+    // A token that runs up to the end of the input is still a token.
+    if(ch == EOF) return !s.empty();
+    if(isspace(ch) || is_paren((char)ch)) break;
+    s += (char)in.get();
+  }
+
+  return true;
+}
+
+// Builds the node that starts at tokens[i] and advances i past it.
+// "(" must be followed by a value token and then the children; "[" is
+// followed directly by the children; any other token is a leaf value.
+LispNode *LispTree::read_node(const vector<string> &tokens, int &i) {
+  LispNode *node = new LispNode();
+  assert(i < len(tokens));
+
+  string s = tokens[i++];
+  if(is_left_paren(s)) {
+    char left_paren = s[0];
+
+    if(left_paren == '(') {
+      assert(i < len(tokens) && !is_paren(tokens[i]));
+      node->value = tokens[i++];
+    }
+
+    while(i < len(tokens) && !is_right_paren(tokens[i])) {
+      node->children.push_back(read_node(tokens, i));
+    }
+
+    // The list must be closed by the paren matching the one that opened it.
+    assert(i < len(tokens));
+    s = tokens[i++];
+    assert(s == matching_right_paren(left_paren));
+  }
+  else if(is_right_paren(s))
+    assert(false);
+  else
+    node->value = s;
+
+  return node;
+}
+
+void
LispTree::read(const char *file) { + ifstream in(file); + vector tokens; + string token; + while(read_token(in, token)) { + tokens.push_back(token); + } + int i = 0; + root = read_node(tokens, i); + assert(i == len(tokens)); +} + +void LispTree::print() const { + assert(root); + root->print(0); +} diff --git a/merge_generator/basic/lisp.h b/merge_generator/basic/lisp.h new file mode 100644 index 0000000..089ed1e --- /dev/null +++ b/merge_generator/basic/lisp.h @@ -0,0 +1,33 @@ +#ifndef __LISP_H__ +#define __LISP_H__ + +#include +#include + +using namespace std; + +//////////////////////////////////////////////////////////// + +struct LispNode { + void destroy(); + void print(int ind) const; + + string value; + vector children; +}; + +//////////////////////////////////////////////////////////// + +struct LispTree { + LispTree() : root(NULL) { } + ~LispTree(); + + bool read_token(istream &in, string &s); + LispNode *read_node(const vector &tokens, int &i); + void read(const char *file); + void print() const; + + LispNode *root; +}; + +#endif diff --git a/merge_generator/basic/logging.cc b/merge_generator/basic/logging.cc new file mode 100644 index 0000000..95893d8 --- /dev/null +++ b/merge_generator/basic/logging.cc @@ -0,0 +1,145 @@ +#include "logging.h" +#include "opt.h" +#include "mem.h" + +// The logging output has a tree structure, where each node is a +// line of output, and the depth of a node is its indent level. +// A run is the sequence of children of some node. +// A subset of the lines in the run will get printed. + +//////////////////////////////////////////////////////////// + +void Run::init() { + num_lines = 0; + num_lines_printed = 0; + next_line_to_print = 0; + print_all_lines = false; + timer.start(); +} + +void Run::finish() { + // Make it clear that this run is not printed. + // Otherwise, logss might think its + // parent was printed when it really wasn't. 
+ next_line_to_print = -1; + timer.stop(); +} + +bool Run::new_line() { + bool p = print(); + num_lines++; + if(!p) return false; + + // We're going to print this line. Now decide next line to print. + int ms_per_line = log_info.ms_per_line; + if(num_lines <= 2 || // Print first few lines anyway. + ms_per_line == 0 || // Print everything. + print_all_lines) // Print every line in this run. + next_line_to_print++; + else { + timer.stop(); + if(timer.ms == 0) // No time has elapsed. + next_line_to_print *= 2; // Exponentially increase time between lines. + else + next_line_to_print += max(int((double)num_lines * ms_per_line / timer.ms), 1); + } + + num_lines_printed++; + return true; +} + +//////////////////////////////////////////////////////////// +// Global information about logging. + +LogInfo::LogInfo() { + ms_per_line = 0; //1000; // 1 second + max_ind_level = 3; + + ind_level = 0; + buf = ""; + + runs.resize(128); + timer.start(); +} + +LogInfo::~LogInfo() { + out.flush(); +} + +void LogInfo::init() { + if (log_file.empty()) { + out.open("/dev/stdout"); + } else { + cout << "Logging to " << log_file << endl; + out.open(log_file.c_str()); + } +} + +LogInfo log_info; + +//////////////////////////////////////////////////////////// +// LogTracker:: For tracking functions or blocks. + +void LogTracker::begin(bool print_all_lines) { + if(_ind_within) { + if(log_info.this_run().print()) { + const string &s = descrip.str(); + + _logs(name); + if(s.size() > 0 && name[0]) + lout << ": "; + lout << s; + + lout.flush(); + log_info.buf = " {\n"; // Open the block. + + log_info.child_run().init(); + log_info.child_run().print_all_lines = print_all_lines; + } + else { + log_info.max_ind_level = -log_info.max_ind_level; // Prevent children from outputting. + output_stopped = true; + } + } + + log_info.ind_level++; +} + +LogTracker::~LogTracker() { + log_info.ind_level--; + + if(output_stopped) + log_info.max_ind_level = -log_info.max_ind_level; // Restore indent level. 
+ + if(_ind_within) { + if(log_info.this_run().new_line()) { + // Finish up child level. + log_info.ind_level++; + int n = log_info.this_run().num_omitted(); + if(n > 0) + _logs("... " << n << " lines omitted ...\n"); + log_info.ind_level--; + log_info.child_run().finish(); + + if(log_info.buf[0]) // Nothing was printed, because buf hasn't been emptied. + log_info.buf = ""; // Just pretend we didn't open the block. + else // Something indented was printed. + _logs("}"); // Close the block. + + // Print time + Timer &ct = log_info.child_run().timer; + lout << " [" << ct; + if(log_info.ind_level > 0) { + Timer &tt = log_info.this_run().timer; + tt.stop(); + lout << ", cumulative " << tt; + } + lout << "]\n"; + } + } +} + +// Options for logging. +int _log_info_max_ind_level = opt_define_int_wrap("max-ind-level", &log_info.max_ind_level, log_info.max_ind_level, "Maximum indent level for logging", false); +int _log_info_ms_per_line = opt_define_int_wrap("ms-per-line", &log_info.ms_per_line, log_info.ms_per_line, "Print a line out every this many milliseconds", false); +string _log_info_log_file = opt_define_string_wrap("log", &log_info.log_file, log_info.log_file, "File to write log to (\"\" for stdout)", false); diff --git a/merge_generator/basic/logging.h b/merge_generator/basic/logging.h new file mode 100644 index 0000000..c255fb6 --- /dev/null +++ b/merge_generator/basic/logging.h @@ -0,0 +1,122 @@ +#ifndef __LOGGING_H__ +#define __LOGGING_H__ + +#include "std.h" +#include "mem.h" +#include "timer.h" +#include "indent.h" + +//////////////////////////////////////////////////////////// + +// State associated with a run. +struct Run { + Run() { init(); } + bool print() const { return num_lines == next_line_to_print; } + + int num_omitted() { return num_lines - num_lines_printed; } + bool new_line(); + + void init(); + void finish(); + + int num_lines; // Number of lines that we've gone through so far in this run. 
+ int num_lines_printed; // Number of lines actually printed. + int next_line_to_print; // Next line to be printed (lines are 0-based). + Timer timer; // Keeps track of time spent on this run. + bool print_all_lines; // Whether or not to force the printing of each line. +}; + +//////////////////////////////////////////////////////////// +// Global information about logging. + +struct LogInfo { + LogInfo(); + ~LogInfo(); + + void init(); + void hard_flush() { out.flush(); } + + Run &parent_run() { return runs[ind_level-1]; } + Run &this_run() { return runs[ind_level]; } + Run &child_run() { return runs[ind_level+1]; } + + // Parameters. + int max_ind_level; // Maximum indent level. + int ms_per_line; // Number of milliseconds between consecutive lines of output. + string log_file; + + // State. + ofstream out; + int ind_level; // Current indent level. + const char *buf; // The buffer to be flushed out the next time _logs is called. + vector runs; // Indent level -> state + Timer timer; // Timer that starts at the beginning of the program +}; + +extern LogInfo log_info; + +//////////////////////////////////////////////////////////// + +#define lout (log_info.out) +#define here lout << "HERE " << __FILE__ << ':' << __LINE__ << endl +#define _ind_within (log_info.ind_level <= log_info.max_ind_level) +#define _parent_ind_within (log_info.ind_level-1 <= log_info.max_ind_level) +#define _logs(x) \ + do { lout << log_info.buf << Indent(log_info.ind_level) << x; log_info.buf = ""; } while(0) +#define logs(x) \ + do { \ + if(_ind_within && log_info.this_run().new_line()) { \ + _logs(x << endl); \ + } \ + } while(0) +// Output something if parent outputted something. +// Subtle note: parent must have been a track, not logs, so its run +// information has not been updated yet until it closes. +// Therefore, calling print() on it is valid. 
+#define logss(x) \ + do { \ + if(_parent_ind_within && log_info.parent_run().print()) { \ + log_info.this_run().new_line(); \ + _logs(x << endl); \ + } \ + } while(0) + +#define LOGS(x) _logs(x << endl) + +//////////////////////////////////////////////////////////// +// For tracking functions or blocks. +struct LogTracker { + LogTracker(const char *name) : b(true), output_stopped(false), name(name) { } + void begin(bool print_all_lines); + ~LogTracker(); + + bool b; // Trick used in track_block to execute the for loop exactly once. + bool output_stopped; + const char *name; + ostringstream descrip; +}; + +#define track(name, x, all) \ + LogTracker _lt(name); \ + (_ind_within && log_info.this_run().print() && _lt.descrip << x), _lt.begin(all) +#define track_block(name, x, all) \ + for(LogTracker _lt(name); \ + _lt.b && ((_ind_within && log_info.this_run().print() && _lt.descrip << x), _lt.begin(all), true); \ + _lt.b = false) + +#define track_foridx(i, n, s, all) \ + foridx(i, n) track_block(s, i << '/' << n, all) +#define track_forvec(i, tx, x, vec, s, all) \ + forvec(i, tx, x, vec) track_block(s, i << '/' << len(vec), all) + +#define init_log \ + log_info.init(); \ + track("main", to_vector(argv, argc), true); \ + logs(now() << " on " << hostname() << " (" << cpu_speed_mhz() << "MHz)"); + +#define prog_status \ + "PROG_STATUS: " << \ + "time = " << log_info.timer.stop() << \ + ", memory = " << Mem(mem_usage()*1024) + +#endif diff --git a/merge_generator/basic/mem-tracker.cc b/merge_generator/basic/mem-tracker.cc new file mode 100644 index 0000000..d57db0b --- /dev/null +++ b/merge_generator/basic/mem-tracker.cc @@ -0,0 +1,53 @@ +#include "mem-tracker.h" +#include "mem.h" + +/* + * Currently, memory tracking is not accurate. + * Alway underestimates. 
+ */ + +//////////////////////////////////////////////////////////// + +int MemTracker::compute_mem_usage(const MemRecord &r) { + switch(r.type) { + list_types(define_case); + default: assert(0); + } + return 0; +} + +int MemTracker::compute_mem_usage() { + int total_mem = 0; + forvec(_, MemRecord &, r, records) { + if(r.type != T_RAWNUMBER) r.mem = compute_mem_usage(r); + total_mem += r.mem; + } + return total_mem; +} + +static bool record_less_than(const MemRecord &r1, const MemRecord &r2) { + return r1.mem > r2.mem; +} + +void MemTracker::report_mem_usage() { + track("report_mem_usage()", "", true); + + int total_mem = compute_mem_usage(); + + sort(records.begin(), records.end(), record_less_than); + + forvec(_, const MemRecord &, r, records) { + logs(type_names[r.type] << ' ' << r.name << ": " << + Mem(r.mem) << " (" << (double)r.mem/total_mem << ')'); + } + logs("Total: " << Mem(total_mem)); +} + +//////////////////////////////////////////////////////////// + +MemTracker mem_tracker; + +const char *MemTracker::type_names[] = { + "?", + list_types(define_str) +}; diff --git a/merge_generator/basic/mem-tracker.h b/merge_generator/basic/mem-tracker.h new file mode 100644 index 0000000..1e0eb1c --- /dev/null +++ b/merge_generator/basic/mem-tracker.h @@ -0,0 +1,132 @@ +#ifndef __MEM_TRACKER_H__ +#define __MEM_TRACKER_H__ + +#include "std.h" +#include "stl-basic.h" +#include "union-set.h" +#include "strdb.h" + +// Currently, memory tracking is not accurate. +// Alway underestimates. + +// Call this function. Don't use anything else. 
+#define track_mem(x) mem_tracker.add(__STRING(x), x) + +#define list_types(f) \ + f(IntVec) \ + f(IntMat) \ + f(IntIntMap) \ + f(IntDoubleMap) \ + f(IntIntPairMap) \ + f(IntPairDoubleMap) \ + f(IntSet) \ + f(DoubleVec) \ + f(DoubleVecVec) \ + f(StrVec) \ + f(StrIntMap) \ + f(UnionSet) \ + f(StrDB) + +#define prefix_t(type) T_##type, +#define define_str(type) __STRING(type), +#define define_add(type) \ + void add(const char *name, const type &data) { \ + records.push_back(MemRecord(name, T_##type, &data)); \ + } +#define define_case(type) \ + case T_##type: return mem_usage(*((const type *)r.data)); + +enum MemType { T_RAWNUMBER, list_types(prefix_t) }; + +struct MemRecord { + MemRecord(const char *name, int mem) : + name(name), type(T_RAWNUMBER), data(NULL), mem(mem) { } + MemRecord(const char *name, MemType type, const void *data) : + name(name), type(type), data(data), mem(0) { } + string name; + MemType type; + const void *data; + int mem; +}; + +// Track amount of memory used. +class MemTracker { +public: + static const char *type_names[]; + + list_types(define_add) + + void add(const char *name, int mem) { + records.push_back(MemRecord(name, mem)); + } + + int compute_mem_usage(const MemRecord &r); + int compute_mem_usage(); + void report_mem_usage(); + +private: + vector records; +}; + +extern MemTracker mem_tracker; + +//////////////////////////////////////////////////////////// +// Various mem_usage() functions on various data types. 
+ +template int mem_usage(const vector< vector< vector< vector > > > &mat) { // matrix + int mem = 0; + foridx(i, len(mat)) { + foridx(j, len(mat[i])) { + foridx(k, len(mat[i][j])) + mem += len(mat[i][j][k]) * sizeof(T); + mem += len(mat[i][j]) * sizeof(vector); + } + mem += len(mat[i]) * sizeof(vector); + } + mem += len(mat) * sizeof(vector); + return mem; +} + +template int mem_usage(const vector< vector< vector > > &mat) { // matrix + int mem = 0; + foridx(i, len(mat)) { + foridx(j, len(mat[i])) + mem += len(mat[i][j]) * sizeof(T); + mem += len(mat[i]) * sizeof(vector); + } + mem += len(mat) * sizeof(vector); + return mem; +} + +template int mem_usage(const vector< vector > &mat) { // matrix + int mem = 0; + foridx(i, len(mat)) + mem += len(mat[i]) * sizeof(T); + mem += len(mat) * sizeof(vector); + return mem; +} + +template int mem_usage(const vector &vec) { // vector + return len(vec) * sizeof(T); +} + +template int mem_usage(const unordered_set &set) { // hash_set + return (int)set.bucket_count()*4 + len(set)*(sizeof(T)+sizeof(void *)); +} + +template int mem_usage(const unordered_map &map) { // hash_map + return (int)map.bucket_count()*4 + len(map)*(sizeof(Tx)+sizeof(Ty)+sizeof(void *)); +} + +inline int mem_usage(const UnionSet &u) { // UnionSet + return mem_usage(u.parent); +} + +inline int mem_usage(const StrDB &db) { // StrDB + int mem = mem_usage(db.s2i) + mem_usage(db.i2s); + foridx(i, len(db)) + mem += (strlen(db[i])+1) * sizeof(char); + return mem; +} + +#endif diff --git a/merge_generator/basic/mem.h b/merge_generator/basic/mem.h new file mode 100644 index 0000000..6fd3af5 --- /dev/null +++ b/merge_generator/basic/mem.h @@ -0,0 +1,14 @@ +#ifndef __MEM_H__ +#define __MEM_H__ + +// Takes memory is in bytes and formats it nicely +struct Mem { Mem(int mem) : mem(mem) { } int mem; }; +inline ostream &operator<<(ostream &out, const Mem &m) { + unsigned int mem = m.mem; + if(mem < 1024) out << mem; + else if(mem < 1024*1024) out << mem/1024 << 'K'; + else 
out << mem/(1024*1024) << 'M';
+  return out;
+}
+
+#endif
diff --git a/merge_generator/basic/multi-ostream.cc b/merge_generator/basic/multi-ostream.cc
new file mode 100644
index 0000000..ef6e7b1
--- /dev/null
+++ b/merge_generator/basic/multi-ostream.cc
@@ -0,0 +1,61 @@
+#include "multi-ostream.h"
+
+/*
+ * Create a multi_ostream, and you can add many files or any ostream objects
+ * to it. The output sent to the multi_ostream will be redirected to the many
+ * destinations.
+ * Useful for logging to a file and stdout.
+ */
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+using namespace std;
+
+// Flushes whatever is still buffered, then destroys the registered streams
+// (ostream_info::destroy() only deletes the ones this buffer owns).
+multi_buf::~multi_buf() {
+  flush();
+  for(int i = 0; i < (int)infos.size(); i++)
+    infos[i].destroy();
+}
+
+// Registers another destination stream.
+void multi_buf::add(ostream *out, bool own, bool hard) {
+  infos.push_back(ostream_info(out, own, hard));
+}
+
+// Writes the buffered characters to every destination, flushes each one and
+// empties the buffer.
+void multi_buf::flush() {
+  for(int i = 0; i < (int)infos.size(); i++) {
+    ostream_info &info = infos[i];
+    info.out->write(buf, buf_i);
+    info.out->flush();
+  }
+  buf_i = 0;
+}
+
+// Like flush(), but destinations registered as hard are hard-flushed
+// through hard_ofstream::hard_flush() instead.
+void multi_buf::hard_flush() {
+  for(int i = 0; i < (int)infos.size(); i++) {
+    ostream_info &info = infos[i];
+    info.out->write(buf, buf_i);
+    if(info.hard)
+      ((hard_ofstream *)info.out)->hard_flush();
+    else
+      info.out->flush();
+  }
+  buf_i = 0;
+}
+
+// Called by the stream for each character written.  The character is
+// buffered, and the buffer is flushed when it is full or a newline arrives.
+int multi_buf::overflow(int ch) {
+  if(traits_type::eq_int_type(ch, traits_type::eof())) {
+    // The eof value carries no character: flush only, and report success
+    // with a value that is not eof.
+    flush();
+    return traits_type::not_eof(ch);
+  }
+  buf[buf_i++] = (char)ch;
+  if(buf_i == (int)sizeof(buf) || ch == '\n') flush();
+  return ch;
+}
+
+ostream &multi_ostream::flush() {
+  sbuf.flush();
+  return *this;
+}
+
+ostream &multi_ostream::hard_flush() {
+  sbuf.hard_flush();
+  return *this;
+}
diff --git a/merge_generator/basic/multi-ostream.h b/merge_generator/basic/multi-ostream.h
new file mode 100644
index 0000000..61f0d25
--- /dev/null
+++ b/merge_generator/basic/multi-ostream.h
@@ -0,0 +1,67 @@
+#ifndef __MULTI_OSTREAM_H__
+#define __MULTI_OSTREAM_H__
+
+/*
+ * Create a multi_ostream, and you can add many files or any ostream objects
+ * to it. The output sent to the multi_ostream will be redirected to the many
+ * destinations.
+ * Useful for logging to a file and stdout. + */ + +#include +#include +#include + +#include "hard-ofstream.h" + +using namespace std; + +struct ostream_info { + ostream_info(ostream *out, bool own, bool hard) : out(out), own(own), hard(hard) { } + ostream *out; + bool own; // Whether we own the ostream and should destroy it at the end. + bool hard; // Whether this is a hard_ofstream. + + void destroy() { if(own) delete out; } +}; + +class multi_buf : public streambuf { +public: + multi_buf() : buf_i(0) { } + ~multi_buf(); + + void flush(); + void hard_flush(); + + void add(ostream *out, bool own, bool hard); + void remove_last() { flush(); infos.back().destroy(); infos.pop_back(); } + +protected: + virtual int overflow(int ch); + +private: + vector infos; + char buf[16384]; + int buf_i; +}; + +class multi_ostream : public basic_ostream > { +public: + multi_ostream() : basic_ostream >(&sbuf) { } + + virtual ostream &flush(); + virtual ostream &hard_flush(); + + void add(const char *file, bool hard = false) { + ostream *out = hard ? 
new hard_ofstream(file) : new ofstream(file); + sbuf.add(out, true, hard); + } + void add(ostream *out) { sbuf.add(out, false, false); } + + void remove_last() { sbuf.remove_last(); } + +private: + multi_buf sbuf; +}; + +#endif diff --git a/merge_generator/basic/opt.cc b/merge_generator/basic/opt.cc new file mode 100644 index 0000000..b38f692 --- /dev/null +++ b/merge_generator/basic/opt.cc @@ -0,0 +1,189 @@ +#include "opt.h" +#include "std.h" +#include "logging.h" +#include + +//////////////////////////////////////////////////////////////////////// +// command-line arguments + +void GetOpt::AddOpt(const string &name, bool has_arg) { + opts.push_back(pair(name, has_arg)); +} + +void GetOpt::Parse(int argc, char *argv[]) { + option *opt_list = new option[opts.size()+1]; + for(int i = 0; i <= (int)opts.size(); i++) { + option *o = &opt_list[i]; + if(i < (int)opts.size()) { + o->name = opts[i].first.c_str(); + o->has_arg = opts[i].second; + //printf("N %s\n", o->name); + } + else { + o->name = NULL; + o->has_arg = 0; + } + o->flag = NULL; + o->val = 0; + } + + int i; + + values.clear(); + values.resize(opts.size()); + while(true) { + int status = getopt_long(argc, argv, "", opt_list, &i); + if(status == -1) break; + assert(status == 0); + //debug("%d %s -> %s\n", i, opt_list[i].name, optarg); + // put a 1 to signify that the argument exists + values[i] = optarg ? optarg : "1"; + } + + delete [] opt_list; +} + +int GetOpt::Lookup(const string &name) const { + for(int i = 0; i < (int)opts.size(); i++) { + if(opts[i].first == name) return i; + } + return -1; +} + +string GetOpt::Get(const string &name, const string &default_value) const { + int i = Lookup(name); + return i != -1 && !values[i].empty() ? 
values[i] : default_value; +} + +string GetOpt::Get(const string &name) const { + string x = Get(name, ""); + if(x.empty()) { + fprintf(stderr, "Missing required parameter `%s'.\n", name.c_str()); + exit(1); + } + return x; +} + +bool GetOpt::Exists(const string &name) const { + return !Get(name, "").empty(); +} + +int GetOpt::GetInt(const string &name) const { + int x; + int r = sscanf(Get(name).c_str(), "%d", &x); + assert(r == 1); + return x; +} + +int GetOpt::GetInt(const string &name, int default_value) const { + return Exists(name) ? GetInt(name) : default_value; +} + +double GetOpt::GetDouble(const string &name) const { + double x; + int r = sscanf(Get(name).c_str(), "%lf", &x); + assert(r == 1); + return x; +} + +double GetOpt::GetDouble(const string &name, double default_value) const { + return Exists(name) ? GetDouble(name) : default_value; +} + +//////////////////////////////////////////////////////////// + +void process_opt(int argc, char *argv[]) { + GetOpt opt; + + // set up GetOpt to parse + for(int i = 0; i < (int)bool_opts.size(); i++) { + opt.AddOpt(bool_opts[i].name, false); + opt.AddOpt("no" + bool_opts[i].name, false); + } + for(int i = 0; i < (int)int_opts.size(); i++) + opt.AddOpt(int_opts[i].name, true); + for(int i = 0; i < (int)double_opts.size(); i++) + opt.AddOpt(double_opts[i].name, true); + for(int i = 0; i < (int)string_opts.size(); i++) + opt.AddOpt(string_opts[i].name, true); + opt.AddOpt("help", false); + + // parse + opt.Parse(argc, argv); + + // print help if called for + if(opt.Exists("help") || !opt.Exists("text")) { + printf("usage: %s\n", argv[0]); + for(int i = 0; i < (int)bool_opts.size(); i++) { + const OptInfo &o = bool_opts[i]; + printf(" %c%-20s: %s", " *"[o.required], o.name.c_str(), o.msg.c_str()); + if(!o.required) printf(" [%s]", *(o.var) ? 
"true" : "false"); + printf("\n"); + } + for(int i = 0; i < (int)int_opts.size(); i++) { + const OptInfo &o = int_opts[i]; + printf(" %c%-13s : %s", " *"[o.required], o.name.c_str(), o.msg.c_str()); + if(!o.required) printf(" [%d]", *(o.var)); + printf("\n"); + } + for(int i = 0; i < (int)double_opts.size(); i++) { + const OptInfo &o = double_opts[i]; + printf(" %c%-13s : %s", " *"[o.required], o.name.c_str(), o.msg.c_str()); + if(!o.required) printf(" [%f]", *(o.var)); + printf("\n"); + } + for(int i = 0; i < (int)string_opts.size(); i++) { + const OptInfo &o = string_opts[i]; + printf(" %c%-13s : %s", " *"[o.required], o.name.c_str(), o.msg.c_str()); + if(!o.required) printf(" [%s]", (o.var)->c_str()); + printf("\n"); + } + exit(1); + } + + // retrieve data; store the variables + for(int i = 0; i < (int)bool_opts.size(); i++) { + const OptInfo &o = bool_opts[i]; + bool yes = opt.Exists(o.name); + bool no = opt.Exists("no" + o.name); + assert(!o.required || (yes || no)); + assert(!yes || !no); + if(yes) *(o.var) = true; + if(no) *(o.var) = false; + } + for(int i = 0; i < (int)int_opts.size(); i++) { + const OptInfo &o = int_opts[i]; + *(o.var) = o.required ? opt.GetInt(o.name) : opt.GetInt(o.name, *(o.var)); + } + for(int i = 0; i < (int)double_opts.size(); i++) { + const OptInfo &o = double_opts[i]; + *(o.var) = o.required ? opt.GetDouble(o.name) : opt.GetDouble(o.name, *(o.var)); + } + for(int i = 0; i < (int)string_opts.size(); i++) { + const OptInfo &o = string_opts[i]; + *(o.var) = o.required ? opt.Get(o.name) : opt.Get(o.name, *(o.var)); + } +} + +void init_opt(int argc, char *argv[]) { + process_opt(argc, argv); + srand(rand_seed); +} + +void print_opts() { + track("print_opts()", "", true); + forvec(_, const OptInfo &, o, bool_opts) + logs(o.name << " = " << (*o.var ? 
"true" : "false")); + forvec(_, const OptInfo &, o, int_opts) + logs(o.name << " = " << *o.var); + forvec(_, const OptInfo &, o, double_opts) + logs(o.name << " = " << *o.var); + forvec(_, const OptInfo &, o, string_opts) + logs(o.name << " = " << *o.var); +} + +//////////////////////////////////////////////////////////// +// Pre defined options. + +// allow user to specify a comment always, so some arbitrary description +// of this program execution can be embedded in the command-line diff --git a/merge_generator/basic/opt.h b/merge_generator/basic/opt.h new file mode 100644 index 0000000..aed427c --- /dev/null +++ b/merge_generator/basic/opt.h @@ -0,0 +1,100 @@ +#ifndef __OPT_H__ +#define __OPT_H__ + +#include +#include +#include + +using namespace std; + +// First thing to call in main(). +void init_opt(int argc, char *argv[]); + +//////////////////////////////////////////////////////////////////////// +// command-line arguments + +class GetOpt { +public: + GetOpt() { } + + void AddOpt(const string &name, bool has_arg); + void Parse(int argc, char *argv[]); + int Lookup(const string &name) const; + + bool Exists(const string &name) const; + string Get(const string &name, const string &default_value) const; + string Get(const string &name) const; + int GetInt(const string &name) const; + int GetInt(const string &name, int default_value) const; + double GetDouble(const string &name) const; + double GetDouble(const string &name, double default_value) const; + +private: + vector< pair > opts; + vector values; +}; + +template struct OptInfo { + OptInfo(const string &name, T *var, const string &msg, bool required) + : name(name), var(var), msg(msg), required(required) { } + + string name; + T *var; // location of the variable that stores this value + string msg; + bool required; +}; + +extern vector< OptInfo > bool_opts; +extern vector< OptInfo > int_opts; +extern vector< OptInfo > double_opts; +extern vector< OptInfo > string_opts; + 
+//////////////////////////////////////////////////////////// + +// two versions: in one, option is required +#define opt_define_bool_req(var, name, msg) \ + bool var = opt_define_bool_wrap(name, &var, false, msg, true) +#define opt_define_bool(var, name, val, msg) \ + bool var = opt_define_bool_wrap(name, &var, val, msg, false) +#define opt_define_int_req(var, name, msg) \ + int var = opt_define_int_wrap(name, &var, 0, msg, true) +#define opt_define_int(var, name, val, msg) \ + int var = opt_define_int_wrap(name, &var, val, msg, false) +#define opt_define_double_req(var, name, msg) \ + double var = opt_define_double_wrap(name, &var, 0.0, msg, true) +#define opt_define_double(var, name, val, msg) \ + double var = opt_define_double_wrap(name, &var, val, msg, false) +#define opt_define_string_req(var, name, msg) \ + string var = opt_define_string_wrap(name, &var, "", msg, true) +#define opt_define_string(var, name, val, msg) \ + string var = opt_define_string_wrap(name, &var, val, msg, false) + +inline bool opt_define_bool_wrap(const string &name, bool *var, bool val, const string &msg, bool required) { + bool_opts.push_back(OptInfo(name, var, msg, required)); + return val; +} + +inline int opt_define_int_wrap(const string &name, int *var, int val, const string &msg, bool required) { + //printf("HELLO %s\n", name.c_str()); + int_opts.push_back(OptInfo(name, var, msg, required)); + //printf("N %d\n", (int)int_opts.size()); + return val; +} +inline double opt_define_double_wrap(const string &name, double *var, double val, const string &msg, bool required) { + double_opts.push_back(OptInfo(name, var, msg, required)); + return val; +} +inline string opt_define_string_wrap(const string &name, string *var, const string &val, const string &msg, bool required) { + string_opts.push_back(OptInfo(name, var, msg, required)); + return val; +} + +//////////////////////////////////////////////////////////// + +void print_opts(); + +extern int rand_seed; +extern string comment; 
+extern int initC;
+
+#endif
diff --git a/merge_generator/basic/pipe.h b/merge_generator/basic/pipe.h
new file mode 100644
index 0000000..3fed44a
--- /dev/null
+++ b/merge_generator/basic/pipe.h
@@ -0,0 +1,58 @@
+/*
+Execute another application, piping input to and from its stdin and stdout.
+*/
+
+#ifndef __PIPE_H__
+#define __PIPE_H__
+
+typedef pair<FILE *, FILE *> FILEPair;
+
+// Fork, then exec cmd in the child (cmd is an argv array handed to execvp,
+// so cmd[0] is the program) with the child's stdin and stdout on pipes.
+// Return input and output file pointers: first reads the child's stdout,
+// second writes to the child's stdin.
+// User is responsible for closing them.
+// May have to close out before reading from in.
+// The system calls are made outside assert() so that they still happen when
+// the code is compiled with NDEBUG.
+inline FILEPair create_pipe(char *const cmd[]) {
+  int p2c_fds[2], c2p_fds[2]; // parent-to-child and child-to-parent pipes
+
+  int rc = pipe(p2c_fds);
+  assert(rc == 0);
+  rc = pipe(c2p_fds);
+  assert(rc == 0);
+  (void)rc; // otherwise unused when assert() is compiled out
+
+  int pid = fork();
+  assert(pid != -1);
+  if(pid != 0) { // parent
+    close(p2c_fds[0]);
+    close(c2p_fds[1]);
+
+    FILE *in = fdopen(c2p_fds[0], "r");
+    FILE *out = fdopen(p2c_fds[1], "w");
+
+    assert(in && out);
+
+    return FILEPair(in, out);
+  }
+  else { // child
+    close(p2c_fds[1]);
+    close(c2p_fds[0]);
+
+    rc = dup2(p2c_fds[0], fileno(stdin));
+    assert(rc != -1);
+    rc = dup2(c2p_fds[1], fileno(stdout));
+    assert(rc != -1);
+    execvp(cmd[0], cmd);
+
+    // execvp() only returns on failure.  Never fall back into the caller's
+    // code as a second copy of the parent.
+    assert(0);
+    _exit(127);
+    return FILEPair(NULL, NULL); // not reached
+  }
+}
+
+#endif
diff --git a/merge_generator/basic/prob-utils.cc b/merge_generator/basic/prob-utils.cc
new file mode 100644
index 0000000..f6d807c
--- /dev/null
+++ b/merge_generator/basic/prob-utils.cc
@@ -0,0 +1,75 @@
+#include "prob-utils.h"
+// Return a sample from the Gaussian with the given mean and variance.
+double rand_gaussian(double mean, double var) {
+  // Use the Box-Muller Transformation: if x_1 and x_2 are independent uniform on (0, 1),
+  // then sqrt(-2 ln x_1) * cos(2*pi*x_2) is Gaussian with mean 0 and variance 1
+  double x1 = rand_double(), x2 = rand_double();
+  while(x1 <= 0) x1 = rand_double(); // rand_double() can return 0, and log(0) is -infinity
+  double z = sqrt(-2*log(x1))*cos(2*M_PI*x2);
+  return z * sqrt(var) + mean;
+}
+
+// The probability of heads is p.
+// Throw n coin tosses.
+// Return number of heads.
+int rand_binomial(int n, double p) {
+  int k = 0;
+  for(; n > 0; n--) k += rand_double() < p; // n <= 0 means no tosses at all
+  return k;
+}
+
+double factorial(int n) { // n! as a double; 1 for n <= 1
+  double ans = 1;
+  while(n > 1) ans *= n--;
+  return ans;
+}
+
+double choose(int n, int k) { // binomial coefficient C(n, k) as a double
+  if(k < 0 || k > n) return 0; // there is no such subset
+  if(n-k < k) k = n-k; // C(n, k) == C(n, n-k); use the shorter product
+  double ans = 1;
+  for(int i = 0; i < k; i++) ans *= n-i;
+  return ans / factorial(k);
+}
+
+double binomial_prob(int n, int k, double p) { // P(exactly k heads in n tosses)
+  return choose(n, k) * pow(p, k) * pow(1-p, n-k);
+}
+
+int rand_index(const fvector &probs) { // sample index i with probability probs[i]
+  double v = rand_double(), sum = 0;
+  foridx(i, len(probs)) {
+    sum += probs[i];
+    if(v < sum) return i;
+  }
+  assert(0); // only reached when probs is empty or sums to less than 1
+  return len(probs)-1; // keeps the result defined when assert() is compiled out
+}
+
+void norm_distrib(fvector &vec) { // scale vec so that it sums to 1
+  double sum = 0;
+  foridx(i, len(vec)) sum += vec[i];
+  foridx(i, len(vec)) vec[i] /= sum; // the sum is assumed to be nonzero
+}
+
+void norm_distrib(fmatrix &mat, int c) { // scale column c of mat so that it sums to 1
+  double sum = 0;
+  foridx(r, len(mat)) sum += mat[r][c];
+  foridx(r, len(mat)) mat[r][c] /= sum; // the sum is assumed to be nonzero
+}
+
+void rand_distrib(fvector &probs, int n) { // fill probs with a random distribution over n outcomes
+  probs.resize(n);
+  foridx(i, n) probs[i] = rand();
+  norm_distrib(probs);
+}
+
+IntVec rand_permutation(int n) { // random permutation of 0..n-1
+  IntVec perm(n);
+  foridx(i, n) perm[i] = i;
+  foridx(i, n) {
+    int j = mrand(i, n); // swap position i with a random position in [i, n)
+    int t = perm[i]; perm[i] = perm[j]; perm[j] = t;
+  }
+  return perm;
+}
diff --git a/merge_generator/basic/prob-utils.h b/merge_generator/basic/prob-utils.h
new file mode 100644
index 0000000..5333c6f
--- /dev/null
+++ b/merge_generator/basic/prob-utils.h
@@ -0,0 +1,22 @@
+#ifndef __PROB_UTILS__
+#define __PROB_UTILS__
+
+#include "stl-basic.h"
+
+// Random sampling.
+int rand_binomial(int n, double p);
+int rand_index(const fvector &probs);
+double rand_gaussian(double mean, double var);
+
+// Defined in prob-utils.cc, so these must not be declared inline here.
+double factorial(int n);
+double choose(int n, int k);
+double binomial_prob(int n, int k, double p);
+
+// Distributions stored as a vector, or as column c of a matrix.
+void norm_distrib(fvector &vec);
+void norm_distrib(fmatrix &mat, int c);
+void rand_distrib(fvector &probs, int n);
+IntVec rand_permutation(int n);
+
+#endif
diff --git a/merge_generator/basic/stats.cc
b/merge_generator/basic/stats.cc
new file mode 100644
index 0000000..7dd7622
--- /dev/null
+++ b/merge_generator/basic/stats.cc
@@ -0,0 +1 @@
+#include "stats.h"
diff --git a/merge_generator/basic/stats.h b/merge_generator/basic/stats.h
new file mode 100644
index 0000000..db8c444
--- /dev/null
+++ b/merge_generator/basic/stats.h
@@ -0,0 +1,82 @@
+#ifndef __STATS_H__
+#define __STATS_H__
+
+#include "std.h"
+#include "stl-basic.h"
+// NOTE: these deliberately replace the <float.h> macros of the same name.
+// Here DBL_MIN is the most negative sentinel (-1e300), not the smallest
+// positive double.  The #undefs avoid a macro redefinition diagnostic.
+#undef DBL_MAX
+#undef DBL_MIN
+#define DBL_MAX 1e300
+#define DBL_MIN (-1e300)
+
+// Running sum and count of a series of values; val() is sum / n.
+struct StatFig {
+  StatFig() { clear(); }
+  StatFig(double sum, int n) : sum(sum), n(n) { }
+  virtual ~StatFig() { }
+
+  // Harmonic mean of the two averages; 0 if either figure is empty.
+  static double F1(const StatFig &fig1, const StatFig &fig2) {
+    if(fig1.n == 0 || fig2.n == 0) return 0;
+    const double a = fig1.val(), b = fig2.val();
+    if(a+b == 0) return 0; // avoid dividing by zero
+    return 2*a*b / (a+b);
+  }
+
+  void add() { add(1); }
+  virtual void add(double v) { sum += v; n++; }
+  virtual void clear() { sum = n = 0; }
+  int size() const { return n; }
+  double val() const { return sum / n; }
+  double mean() const { return sum / n; }
+  double sum;
+  int n;
+};
+
+inline ostream &operator<<(ostream &out, const StatFig &fig) {
+  return out << fig.sum << '/' << fig.n << '=' << fig.val();
+}
+
+////////////////////////////////////////////////////////////
+// Stores the min and the max
+
+struct BigStatFig : public StatFig {
+  BigStatFig() { clear(); }
+  void add(double v) { if(v < min) min = v; if(v > max) max = v; StatFig::add(v); }
+  void clear() { min = DBL_MAX; max = DBL_MIN; StatFig::clear(); }
+  double min, max;
+};
+
+inline ostream &operator<<(ostream &out, const BigStatFig &fig) {
+  return out << fig.n << ':' << fig.min << "/<< " << fig.val() << " >>/" << fig.max;
+}
+
+////////////////////////////////////////////////////////////
+// Stores the standard deviation (and all points)
+
+struct FullStatFig : public BigStatFig {
+  FullStatFig() { clear(); }
+  virtual ~FullStatFig() { }
+  void add(double v) { data.push_back(v); BigStatFig::add(v); }
+  void clear() { data.clear(); BigStatFig::clear(); }
+
+  // Population variance: the mean squared deviation from the mean (divides by n).
+  double variance() const {
+    double var = 0, mean = val();
+    forvec(_, double, v, data) var += sq(v-mean);
+    var /= n;
+    return var;
+  }
+  double stddev() const { return sqrt(variance()); }
+
+  DoubleVec data;
+};
+
+inline ostream &operator<<(ostream &out, const FullStatFig &fig) {
+  // Print the base part through a reference; a value cast would copy every stored point.
+  return out << static_cast<const BigStatFig &>(fig) << '~' << fig.stddev();
+}
+
+#endif
diff --git a/merge_generator/basic/std.cc b/merge_generator/basic/std.cc
new file mode 100644
index 0000000..a1f70eb
--- /dev/null
+++ b/merge_generator/basic/std.cc
@@ -0,0 +1,115 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include "std.h"
+#include "str.h"
+#include "timer.h"
+
+// Return the current date/time.
+string now() {
+  time_t t = time(NULL);
+  return substr(ctime(&t), 0, -1);
+}
+
+// Return this machine's host name, or "" if it cannot be determined.
+string hostname() {
+  char buf[1024];
+  if(gethostname(buf, sizeof(buf)) != 0) return "";
+  buf[sizeof(buf)-1] = '\0'; // a truncated name is not necessarily terminated
+  return buf;
+}
+
+// Return the amount of memory (kB) used by this process
+// Returns 0 if /proc/self/status cannot be read or parsed, -1 if it has no VmRSS line.
+int mem_usage() {
+  ifstream in("/proc/self/status");
+  if(!in) return 0;
+  char buf[1024];
+  static const char *key = "VmRSS";
+
+  while(in.getline(buf, sizeof(buf))) {
+    if(strncmp(buf, key, strlen(key)) != 0) continue;
+    char *s = strchr(buf, ':');
+    if(!s) return 0;
+    int x;
+    if(sscanf(s+1, "%d", &x) != 1) return 0; // never return an unset value
+    return x;
+  }
+  return -1;
+}
+
+// Return whether the file exists.
+bool file_exists(const char *file) {
+  return access(file, F_OK) == 0;
+}
+
+// Create an empty file. Return success.
+bool create_file(const char *file) {
+  ofstream out(file);
+  if(!out) return false;
+  out.close();
+  return true;
+}
+
+// Return the file's modification time, or 0 if stat() fails on it.
+time_t file_modified_time(const char *file) {
+  struct stat stat_buf;
+  if(stat(file, &stat_buf) != 0)
+    return 0;
+  return stat_buf.st_mtime;
+}
+
+// Return the cpu speed in MHz.
+int cpu_speed_mhz() { + ifstream in("/proc/cpuinfo"); + if(!in) return 0; + char buf[1024]; + static const char *key = "cpu MHz"; + + while(in.getline(buf, sizeof(buf))) { + if(strncmp(buf, key, strlen(key)) != 0) continue; + char *s = strchr(buf, ':'); + if(!s) return 0; + double x; + sscanf(s+1, "%lf", &x); + return (int)x; + } + return 0; +} + +// "file" -> "file" +// "dir/file" -> "file" +string strip_dir(string s) { + return substr(s, s.rfind('/')+1); +} + +// "file" -> "file" +// "dir/file" -> "dir" +string get_dir(string s) { + int i = s.rfind('/'); + return i == -1 ? "." : substr(s, 0, s.rfind('/')); +} + +// "base" -> "base" +// "base.ext" -> "base" +string file_base(string s) { + int i = s.rfind('.'); + return i == -1 ? s : substr(s, 0, i); +} + +bool get_files_in_dir(string dirname, bool fullpath, vector &files) { + DIR *dir = opendir(dirname.c_str()); + if(!dir) return false; + while(true) { + dirent *ent = readdir(dir); + if(!ent) break; + // For some reason, sometimes files show up as d_type == DT_UNKNOWN, I + // think due to AFS issues + //cout << "FFF " << ent->d_name << ' ' << (int)ent->d_type << endl; + if(ent->d_type != DT_DIR) { + files.push_back((fullpath ? dirname+"/" : string()) + ent->d_name); + } + } + closedir(dir); + return true; +} diff --git a/merge_generator/basic/std.h b/merge_generator/basic/std.h new file mode 100644 index 0000000..0f970b1 --- /dev/null +++ b/merge_generator/basic/std.h @@ -0,0 +1,110 @@ +#ifndef __STD_H__ +#define __STD_H__ + +#include +#include +#include +//#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +//////////////////////////////////////////////////////////// + +#define len(vec) (int)(vec).size() +#define sq(x) ((x)*(x)) + +// For loop sugar. This is such a hack! 
+#define foridx(i, n) for(int i = 0; i < (n); i++) // (n) is parenthesized so any expression works as the bound
+#define forvec(i, tx, x, vec) for(int i = 0, _##i = 0; i < len(vec); i++) \
+  for(tx x = (vec)[i]; i == _##i; _##i++) // the inner loop runs once per i, just to declare x
+#define formap(tx, x, ty, y, t, map) forstl(t, _##x##y, map) _mapvars(tx, x, ty, y)
+#define forcmap(tx, x, ty, y, t, map) forcstl(t, _##x##y, map) _mapvars(tx, x, ty, y)
+#define forstl(t, x, container) for(t::iterator x = (container).begin(); x != (container).end(); x++)
+#define forcstl(t, x, container) for(t::const_iterator x = (container).begin(); x != (container).end(); x++)
+#define _mapvars(tx, x, ty, y) for(tx x = _##x##y->first, *_##x = &x; _##x; _##x = NULL) \
+  for(ty y = _##x##y->second, *_##y = &y; _##y; _##y = NULL) // one-shot loops that bind x and y to the key and value
+
+////////////////////////////////////////////////////////////
+// Generate random numbers.
+
+inline int mrand(int a) { assert(a > 0); return rand() % a; } // in [0, a); a <= 0 would divide by zero
+inline int mrand(int a, int b) { assert(a < b); return rand() % (b-a) + a; } // in [a, b)
+inline double rand_double() { // in [0, 1), in steps of 1/BASE
+  static const int BASE = 100000;
+  return (double)(rand()%BASE)/BASE;
+}
+
+////////////////////////////////////////////////////////////
+// Floating point stuff.
+
+const double TOL = 1e-10; // default absolute tolerance for the comparisons below
+
+inline bool flt(double u, double v) { return u + TOL < v; } // u is less than v by more than TOL
+inline bool fgt(double u, double v) { return u - TOL > v; } // u is greater than v by more than TOL
+
+// Comparing floating point numbers.
+inline bool feq(double u, double v, double tol = TOL) { return fabs(u-v) < tol; }
+
+template <class T> inline int sign(T u) { // -1, 0 or 1
+  if(u < 0) return -1;
+  if(u > 0) return 1;
+  return 0;
+}
+
+#define assert_feq(u, v) do { _assert_feq(u, v, __FILE__, __LINE__); } while(0)
+#define assert_feq2(u, v, tol) do { _assert_feq(u, v, tol, __FILE__, __LINE__); } while(0)
+#define assert_fneq(u, v) do { _assert_fneq(u, v, __FILE__, __LINE__); } while(0) // no ';' here: the caller writes it, so if/else around a call still parses
+inline void _assert_feq(double u, double v, const char *file, int line) {
+  if(!feq(u, v)) { printf("At %s:%d, %f != %f\n", file, line, u, v); assert(0); }
+}
+inline void _assert_feq(double u, double v, double tol, const char *file, int line) {
+  if(!feq(u, v, tol)) { printf("At %s:%d, %f != %f\n", file, line, u, v); assert(0); }
+}
+inline void _assert_fneq(double u, double v, const char *file, int line) {
+  if(feq(u, v)) { printf("At %s:%d, %f == %f\n", file, line, u, v); assert(0); }
+}
+#define assert_eq(u, v) do { _assert_eq(u, v, #u, #v, __FILE__, __LINE__); } while(0) // #u is the standard stringizing operator
+template <class T> inline void _assert_eq(const T &u, const T &v, const char *us, const char *vs, const char *file, int line) {
+  if(u != v) {
+    cout << "At " << file << ':' << line << ", " <<
+      us << '(' << u << ')' << " != " <<
+      vs << '(' << v << ')' << endl;
+    assert(0);
+  }
+}
+
+#define assert2(x, reason) \
+  do { \
+    if(!(x)) { \
+      cout << "\nFAILURE REASON: " << reason << endl; \
+      assert(x); \
+    } \
+  } while(0) // note: x is evaluated a second time by assert() when the check fails
+
+string now();
+string hostname();
+int cpu_speed_mhz();
+int mem_usage(); // in kB
+
+bool create_file(const char *file);
+bool file_exists(const char *file);
+time_t file_modified_time(const char *file);
+
+string strip_dir(string s);
+string get_dir(string s);
+string file_base(string s);
+bool get_files_in_dir(string dirname, bool fullpath, vector<string> &files);
+
+#endif
diff --git a/merge_generator/basic/stl-basic.cc b/merge_generator/basic/stl-basic.cc
new file mode 100644
index 0000000..1aaa7bd
--- /dev/null
+++
b/merge_generator/basic/stl-basic.cc @@ -0,0 +1 @@ +#include "stl-basic.h" diff --git a/merge_generator/basic/stl-basic.h b/merge_generator/basic/stl-basic.h new file mode 100644 index 0000000..6ba8410 --- /dev/null +++ b/merge_generator/basic/stl-basic.h @@ -0,0 +1,113 @@ +#ifndef __STL_BASIC_H__ +#define __STL_BASIC_H__ + +#include "std.h" +#include "city.h" + +//////////////////////////////////////////////////////////// + +typedef double real; +//typedef float real; + +typedef pair IntPair; +typedef pair IntDouble; +typedef pair DoubleInt; +typedef pair DoublePair; +typedef vector IntPairVec; +typedef vector DoubleIntVec; +typedef vector BoolVec; +typedef vector IntVec; +typedef vector StringVec; +typedef vector IntMat; +typedef vector IntVecVec; +typedef vector IntVecVecVec; +typedef vector IntVecVecVecVec; +typedef vector DoubleVec; +typedef vector DoubleVecVec; +typedef vector DoubleVecVecVec; +typedef vector DoubleVecVecVecVec; +typedef vector IntDoubleVec; +typedef vector IntDoubleVecVec; +typedef vector IntDoubleVecVecVec; +typedef vector IntDoubleVecVecVecVec; + +typedef IntVec ivector; +typedef DoubleVec fvector; +typedef DoubleVecVec fmatrix; + +//////////////////////////////////////////////////////////// + +struct vector_eq { + bool operator()(const IntVec &v1, const IntVec &v2) const { + return v1 == v2; + } +}; +struct vector_hf { + size_t operator()(const IntVec &v) const { + return CityHash64(reinterpret_cast(&v[0]), sizeof(int) * v.size()); +#if 0 + int h = 0; + foridx(i, len(v)) + h = (h<<4)^(h>>28)^v[i]; + return h; +#endif + } +}; + +struct pair_eq { + bool operator()(const IntPair &p1, const IntPair &p2) const { + return p1 == p2; + } +}; +struct pair_hf { + size_t operator()(const IntPair &p) const { + return (p.first<<4)^(p.first>>28) ^ p.second; + } +}; + +struct str_eq { + bool operator()(const char *s1, const char *s2) const { + return strcmp(s1, s2) == 0; + } +}; +struct str_hf { + size_t operator()(const char *s) const { + return 
CityHash64(s, strlen(s)); + } +}; + +struct string_eq { + bool operator()(const string &s1, const string &s2) const { + return s1 == s2; + } +}; +struct string_hf { + size_t operator()(const string &s) const { + return CityHash64(s.c_str(), s.size()); + } +}; + +//////////////////////////////////////////////////////////// + +typedef unordered_set IntSet; +typedef unordered_set IntPairSet; +typedef unordered_set IntVecSet; +typedef unordered_map IntVecDoubleMap; +typedef unordered_map IntVecIntMap; +typedef unordered_map IntIntMap; +typedef unordered_map IntDoubleMap; +typedef unordered_map IntIntPairMap; +typedef unordered_map IntIntVecMap; +typedef unordered_map IntIntIntMapMap; +typedef unordered_map IntPairIntMap; +typedef unordered_map IntPairDoubleMap; +typedef unordered_map IntPairDoubleVecMap; +typedef unordered_map IntVecIntVecMap; +typedef unordered_map IntVecDoubleVecMap; +typedef vector IntIntMapVec; + +typedef vector StrVec; +typedef unordered_map StrIntMap; +typedef unordered_map StrStrMap; + +#endif diff --git a/merge_generator/basic/stl-utils.cc b/merge_generator/basic/stl-utils.cc new file mode 100644 index 0000000..f6d2fbf --- /dev/null +++ b/merge_generator/basic/stl-utils.cc @@ -0,0 +1 @@ +#include "stl-utils.h" diff --git a/merge_generator/basic/stl-utils.h b/merge_generator/basic/stl-utils.h new file mode 100644 index 0000000..14afe42 --- /dev/null +++ b/merge_generator/basic/stl-utils.h @@ -0,0 +1,232 @@ +#ifndef __STL_UTILS__ +#define __STL_UTILS__ + +#include "stl-basic.h" +#include + +#define contains(X, x) ((X).find(x) != (X).end()) + +inline void improve(DoubleInt &x, const DoubleInt &y) { + if(y.first > x.first) x = y; // Bigger is better. +} + +template inline void improve(DoubleInt &x, const DoubleInt &y, Compare compare) { + if(compare(y.first, x.first)) x = y; +} + +// Free up the memory in a vector or hash_map. 
+template <class T> void destroy(T &obj) {
+  T empty_obj;
+  obj.swap(empty_obj); // the old storage is released when empty_obj goes out of scope
+}
+
+// Return the first index i >= i0 with vec[i] == x, or -1 if there is none.
+// i0 must not be negative.
+template <class T> int index_of(const vector<T> &vec, const T &x, int i0 = 0) {
+  for(int i = i0; i < len(vec); i++)
+    if(vec[i] == x) return i;
+  return -1;
+}
+
+// Return the number of elements of vec that are equal to x.
+template <class T> int count_of(const vector<T> &vec, const T &x) {
+  int n = 0;
+  forvec(_, const T &, y, vec)
+    if(x == y) n++;
+  return n;
+}
+
+// Get vec[i], but if i is out of range, expand the vector and fill
+// everything with x.
+template <class T> T &expand_get(vector<T> &vec, int i, const T &x) {
+  assert(i >= 0);
+  if(i >= len(vec)) vec.resize(i+1, x); // only the newly added slots are set to x
+  return vec[i];
+}
+template <class T> T &expand_get(vector< vector<T> > &mat, int i, int j, const T &x) {
+  assert(i >= 0);
+  if(i >= len(mat)) mat.resize(i+1);
+  return expand_get(mat[i], j, x);
+}
+template <class T> T &expand_get(vector< vector< vector<T> > > &mat, int i, int j, int k, const T &x) {
+  assert(i >= 0);
+  if(i >= len(mat)) mat.resize(i+1);
+  return expand_get(mat[i], j, k, x);
+}
+
+// Assuming this vector/matrix will not grow any more,
+// we can safely call compact to reduce the memory usage.
+// This is only effective after deletions.
+// This isn't necessary if we haven't actually touched
+// the memory past size (i.e., we didn't have a bigger
+// structure).
+// Implemented by swapping in a copy: a copy is normally allocated to fit its
+// size, and the oversized buffer is freed along with the temporary.
+template <class T> void vector_compact(vector<T> &vec) {
+  vector<T>(vec).swap(vec);
+}
+// Compacts the outer vector and every row: copying the outer vector copies
+// each row as well, so no separate per-row pass is needed.  (The per-row
+// call that was here named compact(), which this header does not declare.)
+template <class T> void matrix_compact(vector< vector<T> > &mat) {
+  vector< vector<T> >(mat).swap(mat);
+}
+
+// Append to a vector and return a reference to the new element.
+template <class T> inline T &push_back(vector<T> &vec, const T &x = T()) {
+  vec.push_back(x);
+  return vec.back();
+}
+
+// Resize mat to nr rows of nc columns each.
+// Cells that already exist keep their values.
+template <class T> inline void matrix_resize(vector< vector<T> > &mat, int nr, int nc) {
+  mat.resize(nr);
+  foridx(r, nr) mat[r].resize(nc);
+}
+
+// Resize mat to n1 x n2 x n3.
+template <class T> inline void matrix_resize(vector< vector< vector<T> > > &mat, int n1, int n2, int n3) {
+  mat.resize(n1);
+  foridx(i, n1) {
+    mat[i].resize(n2);
+    foridx(j, n2)
+      mat[i][j].resize(n3);
+  }
+}
+
+// Return a new nr x nc matrix with every cell set to v.
+template <class T> inline vector< vector<T> > new_matrix(int nr, int nc, T v) {
+  return vector< vector<T> >(nr, vector<T>(nc, v));
+}
+
+// Set every element of vec to v.
+// Declared before matrix_fill, which calls it, so that the call resolves
+// for every element type.
+template <class T> inline void vector_fill(vector<T> &vec, T v) {
+  foridx(i, len(vec)) vec[i] = v;
+}
+
+// Set every cell of mat to v.
+template <class T> inline void matrix_fill(vector< vector<T> > &mat, T v) {
+  foridx(i, len(mat)) vector_fill(mat[i], v);
+}
+
+// Return the sum of the elements of vec (0 if vec is empty).
+template <class T> inline T vector_sum(const vector<T> &vec) {
+  T sum = 0;
+  foridx(i, len(vec)) sum += vec[i];
+  return sum;
+}
+
+// Returns the index of the minimum element in vec.
+// vec must not be empty.  Ties go to the lowest index.
+template <class T> inline int vector_index_min(const vector<T> &vec) {
+  assert(!vec.empty());
+  int best_i = 0;
+  foridx(i, len(vec))
+    if(vec[i] < vec[best_i]) best_i = i;
+  return best_i;
+}
+
+// Returns the minimum element of vec (which must not be empty).
+// The result is a T: returning int would truncate non-integer elements.
+template <class T> inline T vector_min(const vector<T> &vec) {
+  return vec[vector_index_min(vec)];
+}
+
+// Returns the index of the maximum element in vec.
+// vec must not be empty.  Ties go to the lowest index.
+template <class T> inline int vector_index_max(const vector<T> &vec) {
+  assert(!vec.empty());
+  int best_i = 0;
+  foridx(i, len(vec))
+    if(vec[i] > vec[best_i]) best_i = i;
+  return best_i;
+}
+
+// Returns the maximum element of vec (which must not be empty).
+// The result is a T: returning int would truncate non-integer elements.
+template <class T> inline T vector_max(const vector<T> &vec) {
+  return vec[vector_index_max(vec)];
+}
+
+// Returns the (row, column) index of the maximum element in mat.
+template inline IntPair matrix_index_max(const vector< vector > &mat) { + T max = mat[0][0]; + IntPair best_ij = IntPair(0, 0); + foridx(i, len(mat)) { + foridx(j, len(mat[i])) { + if(mat[i][j] > max) { + max = mat[i][j]; + best_ij = IntPair(i, j); + } + } + } + return best_ij; +} + +// Returns the sum of the elements in column c. +template inline T matrix_col_sum(const vector< vector > &mat, int c) { + T sum = 0; + foridx(r, len(mat)) sum += mat[r][c]; + return sum; +} + +template ostream &operator<<(ostream &out, const pair &p) { + return out << p.first << ' ' << p.second; +} + +template ostream &operator<<(ostream &out, const vector &vec) { + foridx(i, len(vec)) { + if(i > 0) out << ' '; + out << vec[i]; + } + return out; +} + +template ostream &operator<<(ostream &out, const vector< vector > &mat) { + foridx(r, len(mat)) out << mat[r] << endl; + return out; +} + +template vector subvector(const vector &vec, int i, int j = -1) { + int N = len(vec); + if(j < 0) j += N; + if(j < i) j = i; + + // Probably some fancy STL way to do this. + vector subvec(j-i); + foridx(k, j-i) subvec[k] = vec[i+k]; + return subvec; +} + +template vector to_vector(T arr[], int n) { + vector vec(n); + foridx(i, n) vec[i] = arr[i]; + return vec; +} + +inline IntVec to_vector(int n, ...) { + va_list ap; + IntVec vec; + va_start(ap, n); + foridx(i, n) vec.push_back(va_arg(ap, int)); + va_end(ap); + return vec; +} + +inline DoubleVec to_fvector(int n, ...) 
{ + va_list ap; + DoubleVec vec; + va_start(ap, n); + foridx(i, n) vec.push_back(va_arg(ap, double)); + va_end(ap); + return vec; +} + +template inline void operator+=(vector &vec1, const vector &vec2) { + foridx(i, len(vec1)) vec1[i] += vec2[i]; +} + +#endif diff --git a/merge_generator/basic/str-str-db.cc b/merge_generator/basic/str-str-db.cc new file mode 100644 index 0000000..3e96ac7 --- /dev/null +++ b/merge_generator/basic/str-str-db.cc @@ -0,0 +1,35 @@ +#include "str-str-db.h" +#include "std.h" +#include "str.h" +#include "strdb.h" + +StrStrDB::~StrStrDB() { + destroy_strings(s2t); +} + +// File format: lines of \t\t<...junk...> +void StrStrDB::read(const char *file) { + track("StrStrDB::read()", file, true); + char buf[1024]; + ifstream in(file); + assert2(in, file); + + // Read the s2t for each word. + max_t_len = 0; + while(in.getline(buf, sizeof(buf))) { + char *t = strtok(buf, "\t"); + char *s = strtok(NULL, "\t"); + assert(s && t); + + assert2(!contains(s2t, s), s << " appears twice"); + s2t[copy_str(s)] = copy_str(t); + max_t_len = max(max_t_len, (int)strlen(t)); + } + logs("Read " << len(s2t) << " strings"); + logs("Longest mapped string is " << max_t_len << " characters."); +} + +const char *StrStrDB::operator[](const char *word) const { + StrStrMap::const_iterator it = s2t.find(word); + return it == s2t.end() ? "" : it->second; +} diff --git a/merge_generator/basic/str-str-db.h b/merge_generator/basic/str-str-db.h new file mode 100644 index 0000000..5a2db65 --- /dev/null +++ b/merge_generator/basic/str-str-db.h @@ -0,0 +1,19 @@ +#ifndef __STR_STR_DB_H__ +#define __STR_STR_DB_H__ + +#include "stl-basic.h" + +// Maps strings (s) to strings (t). 
+class StrStrDB { +public: + ~StrStrDB(); + + void read(const char *file); + const char *operator[](const char *s) const; + + int max_t_len; +private: + StrStrMap s2t; +}; + +#endif diff --git a/merge_generator/basic/str.cc b/merge_generator/basic/str.cc new file mode 100644 index 0000000..35d5e13 --- /dev/null +++ b/merge_generator/basic/str.cc @@ -0,0 +1,91 @@ +#include "stl-basic.h" +#include + +string substr(const string &s, int i, int j) { + if(i < 0) i += len(s); + if(j < 0) j += len(s); + i = max(i, 0); + j = max(j, i); + return s.substr(i, j-i); +} +string substr(const string &s, int i) { + return substr(s, i, len(s)); +} + +string str_printf(const char *fmt, ...) { + char buf[16384]; + va_list ap; + va_start(ap, fmt); + vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + return buf; +} + +char *copy_str(const char *s) { + char *t = new char[strlen(s)+1]; + strcpy(t, s); + return t; +} + +string int2str(int x) { + return str_printf("%d", x); +} + +string double2str(double x) { + ostringstream os; + os << x; + return os.str(); +} + +StringVec split(const char *str, const char *delims, bool keep_empty) { + StringVec vec; // Store the result. + // Build quick lookup table. + BoolVec is_delim(256); + for(const char *p = delims; *p; p++) is_delim[*p] = true; + is_delim['\0'] = true; + + const char *end = str; + while(true) { + if(is_delim[*end]) { + if(keep_empty || end-str > 0) // Extract token. + vec.push_back(string(str, end-str)); + str = end+1; + } + if(!*end) break; + end++; + } + return vec; +} + +StrVec mutate_split(char *str, const char *delims) { + StrVec vec; + for(char *p = strtok(str, delims); p; p = strtok(NULL, delims)) + vec.push_back(p); + return vec; +} + +// Remove leading and trailing white space. +char *trim(char *s) { + // Removing leading spaces. + while(*s && isspace(*s)) s++; + + // Remove trailing spaces. 
+ char *t; + for(t = s+strlen(s)-1; t != s && isspace(*t); t--); + t[1] = '\0'; + return s; +} + +string tolower(const char *s) { + string t = s; + foridx(i, len(t)) t[i] = tolower(t[i]); + return t; +} + +// String matching with brute force. +int index_of(const char *s, const char *t) { + int ns = strlen(s), nt = strlen(t); + foridx(i, ns-nt+1) + if(strncmp(s+i, t, nt) == 0) return i; + return -1; +} diff --git a/merge_generator/basic/str.h b/merge_generator/basic/str.h new file mode 100644 index 0000000..15d11fa --- /dev/null +++ b/merge_generator/basic/str.h @@ -0,0 +1,22 @@ +#ifndef __STR_H__ +#define __STR_H__ + +#include "stl-basic.h" + +string substr(const string &s, int i, int j); +string substr(const string &s, int i); + +string str_printf(const char *fmt, ...); +char *copy_str(const char *s); +string int2str(int x); +string double2str(double x); + +StringVec split(const char *str, const char *delims, bool keep_empty); +StrVec mutate_split(char *str, const char *delims); + +char *trim(char *s); +string tolower(const char *s); + +int index_of(const char *s, const char *t); + +#endif diff --git a/merge_generator/basic/strdb.cc b/merge_generator/basic/strdb.cc new file mode 100644 index 0000000..ed3db6a --- /dev/null +++ b/merge_generator/basic/strdb.cc @@ -0,0 +1,209 @@ +#include "strdb.h" +#include "str.h" + +void destroy_strings(StrVec &vec) { + foridx(i, len(vec)) + delete [] vec[i]; +} + +void destroy_strings(StrStrMap &map) { + typedef const char *const_char_ptr; + StrVec strs; + formap(const_char_ptr, s, const_char_ptr, t, StrStrMap, map) { + strs.push_back(s); + strs.push_back(t); + } + destroy_strings(strs); +} + +//////////////////////////////////////////////////////////// + +int StrDB::read(istream &in, int N, bool one_way) { + char s[16384]; + clear(); + while(size() < N && in >> s) { + if(one_way) i2s.push_back(copy_str(s)); + else (*this)[s]; + } + logs(size() << " strings read"); + return size(); +} + +int StrDB::read(const char *file, bool 
one_way) { + track("StrDB::read()", file << ", one_way=" << one_way, true); + ifstream in(file); + assert(in); + return read(in, INT_MAX, one_way); +} + +void StrDB::write(ostream &out) { + foridx(i, size()) + out << i2s[i] << endl; + logs(size() << " strings written"); +} + +void StrDB::write(const char *file) { + track("StrDB::write()", file, true); + ofstream out(file); + write(out); +} + +const char *StrDB::operator[](int i) const { + assert(i >= 0 && i < len(i2s)); + return i2s[i]; +} + +int StrDB::lookup(const char *s, bool incorp_new, int default_i) { + StrIntMap::const_iterator it = s2i.find(s); + if(it != s2i.end()) return it->second; + if(incorp_new) { + char *t = copy_str(s); + int i = s2i[t] = len(i2s); + i2s.push_back(t); + return i; + } + else + return default_i; +} + +IntVec StrDB::lookup(const StrVec &svec) { + IntVec ivec(len(svec)); + foridx(i, len(svec)) + ivec[i] = lookup(svec[i], true, -1); + return ivec; +} + +int StrDB::operator[](const char *s) const { + StrIntMap::const_iterator it = s2i.find(s); + if(it != s2i.end()) return it->second; + return -1; +} + +int StrDB::operator[](const char *s) { + return lookup(s, true, -1); +} + +ostream &operator<<(ostream &out, const StrDB &db) { + foridx(i, len(db)) out << db[i] << endl; + return out; +} + +//////////////////////////////////////////////////////////// + +int IntPairIntDB::lookup(const IntPair &p, bool incorp_new, int default_i) { + IntPairIntMap::const_iterator it = p2i.find(p); + if(it != p2i.end()) return it->second; + + if(incorp_new) { + int i = p2i[p] = len(i2p); + i2p.push_back(p); + return i; + } + else + return default_i; +} + +int IntPairIntDB::read(istream &in, int N) { + assert(size() == 0); + int a, b; + while(size() < N && in >> a >> b) + (*this)[IntPair(a, b)]; + return size(); +} + +void IntPairIntDB::write(ostream &out) { + forvec(_, const IntPair &, p, i2p) + out << p.first << ' ' << p.second << endl; +} + +//////////////////////////////////////////////////////////// + 
+int IntVecIntDB::lookup(const IntVec &v, bool incorp_new, int default_i) { + IntVecIntMap::const_iterator it = v2i.find(v); + if(it != v2i.end()) return it->second; + + if(incorp_new) { + int i = v2i[v] = len(i2v); + i2v.push_back(v); + return i; + } + else + return default_i; +} + +//////////////////////////////////////////////////////////// + +// A text is basically a string of words. +// Normally, we just read the strings from file, put them in db, +// and call back func. +// But if the db already exists and the strings have been converted +// into integers (i.e., .{strdb,int} exist), then use those. +// If incorp_new is false, then words not in db will just get passed -1. +typedef void int_func(int a); +void read_text(const char *file, int_func *func, StrDB &db, bool read_cached, bool write_cached, bool incorp_new) { + track("read_text()", file, true); + + string strdb_file = string(file)+".strdb"; + string int_file = string(file)+".int"; + + // Use the cached strdb and int files only if they exist and they are + // newer than the text file. + read_cached &= file_exists(strdb_file.c_str()) && + file_exists(int_file.c_str()) && + file_modified_time(strdb_file.c_str()) > file_modified_time(file) && + file_modified_time(int_file.c_str()) > file_modified_time(file); + + if(read_cached) { + // Read from strdb and int. + assert(db.size() == 0); // db must be empty because we're going to clobber it all + db.read(strdb_file.c_str(), true); + track_block("", "Reading from " << int_file, false) { + ifstream in(int_file.c_str()); + char buf[16384]; + while(true) { + in.read(buf, sizeof(buf)); + if(in.gcount() == 0) break; + assert(in.gcount() % sizeof(int) == 0); + for(int buf_i = 0; buf_i < in.gcount(); buf_i += 4) { + int a = *((int *)(buf+buf_i)); + assert(a >= 0 && a < db.size()); + func(a); + } + } + } + } + else { + track_block("", "Reading from " << file, false) { + // Write to strdb and int. 
+ ifstream in(file); + ofstream out; + + if(write_cached) { + out.open(int_file.c_str()); + if(!out) write_cached = false; + } + if(write_cached) logs("Writing to " << int_file); + + char s[16384]; + char buf[16384]; int buf_i = 0; // Output buffer + while(in >> s) { // Read a string + int a = db.lookup(s, incorp_new, -1); + if(func) func(a); + + if(write_cached) { + if(buf_i + sizeof(a) > sizeof(buf)) { // Flush buffer if full + out.write(buf, buf_i); + buf_i = 0; + } + *((int *)(buf+buf_i)) = a; + buf_i += sizeof(a); + } + } + if(write_cached) // Final flush + out.write(buf, buf_i); + } + + if(write_cached && create_file(strdb_file.c_str())) + db.write(strdb_file.c_str()); + } +} diff --git a/merge_generator/basic/strdb.h b/merge_generator/basic/strdb.h new file mode 100644 index 0000000..4f004ea --- /dev/null +++ b/merge_generator/basic/strdb.h @@ -0,0 +1,101 @@ +#ifndef __STRDB_H__ +#define __STRDB_H__ + +#include "std.h" +#include "stl-basic.h" +#include "stl-utils.h" +#include "logging.h" + +void destroy_strings(StrVec &vec); +void destroy_strings(StrStrMap &map); + +// Map between strings and integers. +// Strings must not have spaces in them. +// File format: strings, one per line. Assume strings are distinct. 
+struct StrDB { + StrDB() { } + ~StrDB() { destroy_strings(); } + + int read(istream &in, int n, bool one_way); + int read(const char *file, bool one_way); + + void write(ostream &out); + void write(const char *file); + + int size() const { return len(i2s); } + void clear() { destroy_strings(); i2s.clear(); s2i.clear(); } + void destroy() { destroy_strings(); ::destroy(i2s); ::destroy(s2i); } + void destroy_s2i() { ::destroy(s2i); } + void clear_keep_strings() { i2s.clear(); s2i.clear(); } + + const char *operator[](int i) const; + int operator[](const char *s) const; + int operator[](const char *s); + int lookup(const char *s, bool incorp_new, int default_i); + + IntVec lookup(const StrVec &svec); + + bool exists(const char *s) const { return s2i.find(s) != s2i.end(); } + + // /usr/bin/top might not show the memory reduced. + void destroy_strings() { ::destroy_strings(i2s); } + + StrVec i2s; + StrIntMap s2i; +}; + +ostream &operator<<(ostream &out, const StrDB &db); + +//////////////////////////////////////////////////////////// + +// Map between IntPairs and ints. +struct IntPairIntDB { + IntPair operator[](int i) const { return i2p[i]; } + int operator[](const IntPair &p) { return lookup(p, true, -1); } + int lookup(const IntPair &p, bool incorp_new, int default_i); + int size() const { return len(i2p); } + + int read(istream &in, int N); + void write(ostream &out); + + IntPairIntMap p2i; + IntPairVec i2p; +}; + +//////////////////////////////////////////////////////////// + +// Map between IntVecs and ints. +struct IntVecIntDB { + const IntVec &operator[](int i) const { return i2v[i]; } + int operator[](const IntVec &v) { return lookup(v, true, -1); } + int lookup(const IntVec &v, bool incorp_new, int default_i); + int size() const { return len(i2v); } + + IntVecIntMap v2i; + IntVecVec i2v; +}; + +//////////////////////////////////////////////////////////// + +#if 0 +// Map between IntArrays and ints. Arrays terminate with -1. 
+struct IntArrayIntDB { + int *operator[](int i) const { return i2a[i]; } + int operator[](const IntArray &a) { return lookup(a, true, -1); } + int lookup(const IntArray &a, bool incorp_new, int default_i); + int size() const { return len(i2a); } + + int read(istream &in, int N); + void write(ostream &out); + + hash_map p2i; + vector i2a; +}; +#endif + +//////////////////////////////////////////////////////////// + +typedef void int_func(int a); +void read_text(const char *file, int_func *func, StrDB &db, bool read_cached, bool write_cached, bool incorp_new); + +#endif diff --git a/merge_generator/basic/timer.cc b/merge_generator/basic/timer.cc new file mode 100644 index 0000000..b5a7f11 --- /dev/null +++ b/merge_generator/basic/timer.cc @@ -0,0 +1,11 @@ +#include "timer.h" + +ostream &operator<<(ostream &out, const Timer &timer) { + int ms = timer.ms; + int m = ms / 60000; ms %= 60000; + int h = m / 60; m %= 60; + if(h > 0) out << h << 'h'; + if(h > 0 || m > 0) out << m << 'm'; + out << ms/1000.0 << 's'; + return out; +} diff --git a/merge_generator/basic/timer.h b/merge_generator/basic/timer.h new file mode 100644 index 0000000..c0778fd --- /dev/null +++ b/merge_generator/basic/timer.h @@ -0,0 +1,35 @@ +#ifndef __TIMER_H__ +#define __TIMER_H__ + +#include +#include +#include +#include + +using namespace std; + +struct Timer { + Timer() { } + Timer(int ms) : ms(ms) { } + + //void start() { clock_gettime(0, &start_time); } + void start() { gettimeofday(&start_time, NULL); } + Timer &stop() { + //clock_gettime(0, &end_time); + gettimeofday(&end_time, NULL); + ms = Timer::to_ms(end_time) - Timer::to_ms(start_time); + return *this; + } + //static int to_ms(const timespec &tv) { return tv.tv_sec*1000 + tv.tv_nsec/1000000; } + static int to_ms(const timeval &tv) { return tv.tv_sec*1000 + tv.tv_usec/1000; } + + //timespec start_time; + //timespec end_time; + timeval start_time; + timeval end_time; + int ms; +}; + +ostream &operator<<(ostream &out, const Timer &timer); + 
+#endif diff --git a/merge_generator/basic/union-set.cc b/merge_generator/basic/union-set.cc new file mode 100644 index 0000000..9de748c --- /dev/null +++ b/merge_generator/basic/union-set.cc @@ -0,0 +1,29 @@ +#include "union-set.h" + +void UnionSet::Init(int n) { + parent.resize(n); + for(int v = 0; v < n; v++) + parent[v] = v; +} + +// return whether u and v are in the same connected component; +// connect them if they aren't +bool UnionSet::Do(int u, int v, bool doit) { + int ru = GetRoot(u); + int rv = GetRoot(v); + if(ru == rv) return true; + if(doit) parent[ru] = rv; + return false; +} + +int UnionSet::GetRoot(int v) { + int rv = v; + while(parent[rv] != rv) + rv = parent[rv]; + while(v != rv) { + int pv = parent[v]; + parent[v] = rv; + v = pv; + } + return rv; +} diff --git a/merge_generator/basic/union-set.h b/merge_generator/basic/union-set.h new file mode 100644 index 0000000..eb3f5e9 --- /dev/null +++ b/merge_generator/basic/union-set.h @@ -0,0 +1,22 @@ +#ifndef __UNION_SET_H__ +#define __UNION_SET_H__ + +#include + +using namespace std; + +struct UnionSet { + UnionSet() { } + UnionSet(int n) { Init(n); } + void Init(int n); + + bool Join(int u, int v) { return Do(u, v, true); } + bool InSameSet(int u, int v) { return Do(u, v, false); } + + bool Do(int u, int v, bool doit); + int GetRoot(int v); + + vector parent; +}; + +#endif diff --git a/merge_generator/cluster-viewer/LICENSE b/merge_generator/cluster-viewer/LICENSE new file mode 100644 index 0000000..1a2a94a --- /dev/null +++ b/merge_generator/cluster-viewer/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2014 Chris Dyer and Brendan O'Connor + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to 
permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/merge_generator/cluster-viewer/README.md b/merge_generator/cluster-viewer/README.md new file mode 100644 index 0000000..2a66679 --- /dev/null +++ b/merge_generator/cluster-viewer/README.md @@ -0,0 +1,26 @@ +This code generates an HTML viewer for the clustering tree generated, similar to [this clustering of the words in a corpus of English Twitter data](http://www.ark.cs.cmu.edu/TweetNLP/cluster_viewer.html). + +## Instructions + +The `wcluster` tool generates a directory with a file called `paths` that contains the bit string representations of the clustering tree, e.g. + + 000000 Westfalenpokalfinale 10 + 000000 Heimpunktspiel 10 + 000000 Jugendhallenturnier 10 + ... + +The script `cluster-viewer/build-viewer.sh` creates an HTML visualization of the contents of this file. You can run it as follows: + + ./cluster-viewer/build-viewer.sh corpus.out/paths + +This command creates a directory called `clusters/` containing the HTML viewer. 
Specify an alternative directory as follows: + + ./cluster-viewer/build-viewer.sh corpus.out/paths /some/other/output-dir + +## Requirements + + * Python must be in your path + +## Acknowledgements + +These scripts were originally written by [Brendan O'Connor](http://brenocon.com/) and extended by [Chris Dyer](http://www.cs.cmu.edu/~cdyer/). diff --git a/merge_generator/cluster-viewer/build-viewer.sh b/merge_generator/cluster-viewer/build-viewer.sh new file mode 100755 index 0000000..1476a09 --- /dev/null +++ b/merge_generator/cluster-viewer/build-viewer.sh @@ -0,0 +1,32 @@ +#!/bin/bash +set -e + +CODEDIR=`dirname $0`/code + +if [ "$#" -lt "1" ] || [ "$#" -gt "2" ] +then + echo "Usage: $0 path/to/clusters.out/paths [outdir]" 1>&2 + echo 1>&2 + echo "Builds an HTML cluster viewer." 1>&2 + echo 1>&2 + exit +fi +MAPFILE=$1 +CATCMD=cat +if [[ "$MAPFILE" == *.gz ]] +then + CATCMD='gunzip -c' +fi +OUTDIR=clusters +if [ $# -eq 2 ] +then + OUTDIR=$2 +fi + +echo "Creating output in $OUTDIR ..." 1>&2 +mkdir -p $OUTDIR +mkdir -p $OUTDIR/paths +$CATCMD $MAPFILE | python $CODEDIR/make_html.py $CODEDIR $OUTDIR > $OUTDIR/htmlrows.html +python $CODEDIR/final.py $CODEDIR $OUTDIR > $OUTDIR/cluster_viewer.html +echo "Done. 
View clusters in $OUTDIR/cluster_viewer.html" 1>&2 + diff --git a/merge_generator/cluster-viewer/code/final.py b/merge_generator/cluster-viewer/code/final.py new file mode 100644 index 0000000..d12b1fd --- /dev/null +++ b/merge_generator/cluster-viewer/code/final.py @@ -0,0 +1,8 @@ +import sys +template = open(sys.argv[1] + '/template.html').read() +final = template +final = final.replace('STYLE', open(sys.argv[1] + '/style.css').read()) +htmlrows = open(sys.argv[2] + '/htmlrows.html').read() +final = final.replace('TABLE', htmlrows) +print final + diff --git a/merge_generator/cluster-viewer/code/htmlrows.html b/merge_generator/cluster-viewer/code/htmlrows.html new file mode 100644 index 0000000..0a9ccc3 --- /dev/null +++ b/merge_generator/cluster-viewer/code/htmlrows.html @@ -0,0 +1,18 @@ + + + ^000000 (3) + Westfalenpokalfinale Heimpunktspiel Jugendhallenturnier + + + + + ^0000010 (3) + Friesendorf Fallenstellen Strafjustizsystem + + + + + ^00000110 (3) + Gewerbeflächenkonzept Musikprotokoll Familienbetreuungszentrum + + diff --git a/merge_generator/cluster-viewer/code/make_html.py b/merge_generator/cluster-viewer/code/make_html.py new file mode 100644 index 0000000..ee061c5 --- /dev/null +++ b/merge_generator/cluster-viewer/code/make_html.py @@ -0,0 +1,75 @@ +import sys,itertools + +style = open(sys.argv[1] + '/style.css').read() + +def get_word_rows(): + for line in sys.stdin: + path, word, count = line.split('\t') + count = int(count) + yield path,word,count + +def get_cluster_rows(): + for path, rows in itertools.groupby(get_word_rows(), key=lambda x: x[0]): + wordcounts = [(w,c) for _,w,c in rows] + wordcounts.sort(key=lambda (w,c): -c) + + yield path, len(wordcounts), wordcounts[:50], wordcounts + +def htmlescape(s): + return s.replace('&','&').replace('<','<').replace('>','>') + +def wc_table(wordcounts, tdword=''): + r = [''] + for i,(w,c) in enumerate(wordcounts): + r.append('
{} {} {:,}'.format(i+1, tdword, htmlescape(w), c)) + r.append('
') + return '\n'.join(r) + +def top(wc, th): + cutoff = int(wc[0][1] * th) + res = [] + for (w,c) in wc: + if c > cutoff: res.append((w,c)) + return res + +for path, nwords, wordcounts, allwc in get_cluster_rows(): + # wc1 = ' '.join("{w}[{c}]".format( + # w=htmlescape(w), c=c) for w,c in wordcounts) + wc1 = ' '.join("{w}".format( + w=htmlescape(w)) for w,c in top(wordcounts, 0.01)) + + print """ + + ^{path} ({nwords}) + {wc} + """.format(path=path, nwords=nwords, wc=wc1) + print "" + + with open(sys.argv[2] + '/paths/{path}.html'.format(**locals()),'w') as f: + print>>f,"""""".format(**locals()) + print>>f,"""""" + print>>f,"back to cluster viewer" + print>>f,"

cluster path {path}

".format(path=path) + + print>>f, "{n:,} words, {t:,} tokens".format(n=nwords, t=sum(c for w,c in allwc)) + print>>f, "freq alpha suffix" + + print>>f,"

Words in frequency order

" + allwc.sort(key=lambda (w,c): (-c,w)) + print>>f, wc_table(allwc) + # wc1 = ' '.join("{w} ({c})".format( + # w=htmlescape(w), c=c) for w,c in allwc) + # print>>f, wc1 + + print>>f, "

Words in alphabetical order

" + allwc.sort(key=lambda (w,c): (w,-c)) + print>>f, wc_table(allwc) + + print>>f, "

Words in suffix order

" + allwc.sort(key=lambda (w,c): (list(reversed(w)),-c)) + print>>f, wc_table(allwc, tdword='suffixsort') + # wc1 = ' '.join("{w} ({c})".format( + # w=htmlescape(w), c=c) for w,c in allwc) + # print>>f, wc1 + + diff --git a/merge_generator/cluster-viewer/code/style.css b/merge_generator/cluster-viewer/code/style.css new file mode 100644 index 0000000..53c426f --- /dev/null +++ b/merge_generator/cluster-viewer/code/style.css @@ -0,0 +1,9 @@ +table { border-collapse:collapse; border-spacing:0; } +body { font-family: times; font-size: 11pt; } +td { border: 1px solid gray; padding:2px 8px; } +th { border: 1px solid gray; padding:2px 8px; } +.count { font-size:9pt; color: solid gray; } +.c { font-size:7pt; color: solid gray; } +.tdcount { text-align:right } +.info { font-size: 12pt; } +.suffixsort { text-align: right } diff --git a/merge_generator/cluster-viewer/code/template.html b/merge_generator/cluster-viewer/code/template.html new file mode 100644 index 0000000..3ce7086 --- /dev/null +++ b/merge_generator/cluster-viewer/code/template.html @@ -0,0 +1,22 @@ + + + + +

Word cluster viewer

+ +
+Word cluster viewer. +
+ +

+ + + + TABLE +
Cluster path (and word type count) + Words (most frequent) +
+ + diff --git a/merge_generator/input.txt b/merge_generator/input.txt new file mode 100644 index 0000000..c7ab443 --- /dev/null +++ b/merge_generator/input.txt @@ -0,0 +1,3 @@ +the cat chased the mouse +the dog chased the cat +the mouse chased the dog diff --git a/merge_generator/output.txt b/merge_generator/output.txt new file mode 100644 index 0000000..fabe266 --- /dev/null +++ b/merge_generator/output.txt @@ -0,0 +1,5 @@ +0 the 6 +10 chased 3 +110 dog 2 +1110 mouse 2 +1111 cat 2 diff --git a/merge_generator/wcluster.cc b/merge_generator/wcluster.cc new file mode 100755 index 0000000..b6c9b24 --- /dev/null +++ b/merge_generator/wcluster.cc @@ -0,0 +1,1238 @@ +/* +Hierarchically clusters phrases. +Running time: O(N*C^2). + +We want to cluster the phrases so that the pairwise mutual information between +clusters is maximized. This mutual information is a sum over terms between +each pair of clusters: q2[a, b] for clusters a and b. The trick is to compute +quickly the loss of mutual information when two clusters a and b are merged. + +The four structures p1, p2, q2, L2 allow this quick computation. + p1[a] = probability of cluster a. + p2[a, b] = probability of cluster a followed by cluster b. + q2[a, b] = contribution to the mutual information from clusters a and b (computed from p2[a, b]). + L2[a, b] = the loss of mutual information if clusters a and b were merged. + +Changes: + * Removed hash tables for efficiency. + * Notation: a is a phrase (sequence of words), c is a cluster, s is a slot. +* Removed hash tables for efficiency. +* Notation: a is a phrase (sequence of words), c is a cluster, s is a slot. + +To cut down memory usage: + * Change double to float. +Ideas: + * Hashing vectors is really slow. + * Find intuition behind algorithm based on simple cases + * Test clustering algorithm on artificially generated data. Generate a text + with a class-based ngram model. 
+*/ + +#include "basic/std.h" +#include "basic/stl-basic.h" +#include "basic/stl-utils.h" +#include "basic/str.h" +#include "basic/strdb.h" +#include "basic/union-set.h" +#include "basic/mem-tracker.h" +#include "basic/opt.h" +#include +#include +#include +#include +#include + +#define INPUT_DEFAULT_A 1000 /* Part of Generalised Brown */ + +vector< OptInfo > bool_opts; +vector< OptInfo > int_opts; +vector< OptInfo > double_opts; +vector< OptInfo > string_opts; + +opt_define_string(output_dir, "output_dir", "", "Output everything to this directory."); +opt_define_string(text_file, "text", "", "Text file with corpora (input)."); +opt_define_string(restrict_file, "restrict", "", "Only consider words that appear in this text (input)."); +opt_define_string(paths_file, "paths", "", "File containing root-to-node paths in the clustering tree (input/output)."); +opt_define_string(map_file, "map", "", "File containing lots of good information about each phrase, more general than paths (output)"); +opt_define_string(collocs_file, "collocs", "", "Collocations with most mutual information (output)."); +opt_define_string(featvec_file, "featvec", "", "Feature vectors (output)."); +opt_define_string(comment, "comment", "", "Description of this run."); + +opt_define_int(ncollocs, "ncollocs", 500, "Collocations with most mutual information (output)."); +opt_define_int(initC, "c", 1000, "Number of clusters."); +opt_define_int(initA, "a", INPUT_DEFAULT_A, "Active set size (alias for C)"); /* Generalised Brown terminology */ +opt_define_int(plen, "plen", 1, "Maximum length of a phrase to consider."); +opt_define_int(min_occur, "min-occur", 1, "Keep phrases that occur at least this many times."); +opt_define_int(rand_seed, "rand", time(NULL)*getpid(), "Number to call srand with."); +opt_define_int(num_threads, "threads", 1, "Number of threads to use in the worker pool."); + +opt_define_bool(chk, "chk", false, "Check data structures are valid (expensive)."); +opt_define_bool(print_stats, 
"stats", false, "Just print out stats."); +opt_define_bool(paths2map, "paths2map", false, "Take the paths file and generate a map file."); + +#define use_restrict (!restrict_file.empty()) +const char *delim_str = "$#$"; + +typedef IntPair _; + +StrDB db; // word database +IntVec phrase_freqs; // phrase a < N -> number of times a appears in the text +IntVecVec left_phrases; // phrase a < N -> list of phrases that appear to left of a in the text +IntVecVec right_phrases; // phrase a < N -> list of phrases that appear to right of a in the text +IntIntPairMap cluster_tree; // cluster c -> the 2 sub-clusters that merged to create c +int delim_word; + +IntVec freq_order_phrases; // List of phrases in decreasing order of frequency. + +// Allows for very quick (inverse Ackermann) lookup of clusters and merging +// of clusters. Each phrase points to an arbitrary representative phrase of +// the cluster. +UnionSet phrase2rep; // phrase a -> the rep phrase in the same cluster as a +IntIntMap rep2cluster; // rep phrase a -> the cluster that contains a +IntIntMap cluster2rep; // cluster a -> the rep phrase in cluster a + +// Store all the phrases efficiently. Just for printing out. +// For each phrase length, we store a flattened list of words. +IntVecVec phrases; // length of phrase -> flattened list of words + +// Each cluster will occupy a slot. There will always be two extra slots +// as intermediate scratch space. +IntVec slot2cluster; // slot index -> cluster (-1 if none exists) +IntIntMap cluster2slot; // cluster -> slot index +int free_slot1, free_slot2; // two free slots +int nslots; + +// Partial results that allow quick computation and update of mutual information. +// Mutual information is the sum of all the q2 terms. +// Update p1, p2, q2 for 0..N-1, but L2 only for 0..initC-1. 
+DoubleVec p1; // slot s (containing cluster a) -> probability Pr(a) +DoubleVecVec p2; // slots s, t (containing clusters a, b) -> probability Pr(a, b) +DoubleVecVec q2; // slots s, t (containing clusters a, b) -> contribution to mutual information +DoubleVecVec L2; // slots s, t (containing clusters a, b) -> loss of mutual information if merge a and b + +int curr_cluster_id; // ID to assign to a new cluster +int stage2_cluster_offset; // start of the IDs of clusters created in stage 2 + +double curr_minfo; // Mutual info, should be sum of all q2's + +// Map phrase to the KL divergence to its cluster +DoubleVec kl_map[2]; + +// Variables used to control the thread pool +mutex * thread_idle; +mutex * thread_start; +thread * threads; +struct Compute_L2_Job { + int s; + int t; + int u; + bool is_type_a; +}; +Compute_L2_Job the_job; +bool all_done = false; + +#define FOR_SLOT(s) \ + for(int s = 0; s < len(slot2cluster); s++) \ + for(bool _tmp = true; slot2cluster[s] != -1 && _tmp; _tmp = false) + +// We store only L2[s, t] for which the cluster ID in slot s is smaller +// than the one in slot t. +#define ORDER_VALID(s, t) (slot2cluster[s] < slot2cluster[t]) + +#define num_phrases(l) (len(phrases[l])/(l)) + +int N; // number of phrases +int T; // length of text + +/** + * Log file in which to record merge info, part of + * Generalised Brown patch for cluster.py compatibility. + */ +ofstream merge_log; + +// Output a phrase. +struct Phrase { Phrase(int a) : a(a) { } int a; }; +ostream &operator<<(ostream &out, const Phrase &phrase) { + // Decode the phrase ID into the length and the offset in phrases. + int a = phrase.a; + int l; for(l = 1; a >= num_phrases(l); a -= num_phrases(l), l++); + + foridx(i, l) { + if(i > 0) out << ' '; + out << db[phrases[l][a*l+i]]; + } + return out; +} + +// For pretty-printing of clusters. 
+struct Cluster { Cluster(int c) : c(c) { } int c; }; +ostream &operator<<(ostream &out, const Cluster &cluster) { + int c = cluster.c; + out << c; + + int a; + bool more; + if(c < N) + a = c, more = false; + else { + assert(contains(cluster2rep, c)); + a = cluster2rep[c], more = true; + } + + out << '(' << Phrase(a); + if(more) out << "|..."; + out << ')'; + return out; +} + +#define Slot(s) Cluster(slot2cluster[s]) + +//////////////////////////////////////////////////////////// + +// p2[s, t] + p2[t, s]. +inline double bi_p2(int s, int t) { + if(s == t) return p2[s][s]; + return p2[s][t] + p2[t][s]; +} + +// q2[s, t] + q2[t, s]. +inline double bi_q2(int s, int t) { + if(s == t) return q2[s][s]; + return q2[s][t] + q2[t][s]; +} + +// Hypothetical p1[st] = p1[s] + p1[t]. +inline double hyp_p1(int s, int t) { + return p1[s] + p1[t]; +} + +//// hyp_p2 + +// Hypothetical p2[st, u] = p2[s, u] + p2[t, u]. +inline double hyp_p2(const IntPair &st, int u) { + return p2[st.first][u] + p2[st.second][u]; +} + +// Hypothetical p2[u, st] = p2[u, s] + p2[u, t]. +inline double hyp_p2(int u, const IntPair &st) { + return p2[u][st.first] + p2[u][st.second]; +} + +inline double bi_hyp_p2(const IntPair &st, int u) { + return hyp_p2(st, u) + hyp_p2(u, st); +} + +// Hypothetical p2[st, st] = p2[s, s] + p2[s, t] + p2[t, s] + p2[t, t]. +inline double hyp_p2(const IntPair &st) { + return p2[st.first][st.first] + p2[st.first][st.second] + + p2[st.second][st.first] + p2[st.second][st.second]; +} + +//// hyp_q2 + +inline double p2q(double pst, double ps, double pt) { + if(feq(pst, 0.0)) return 0.0; + return pst * log2(pst / (ps*pt)); +} + +// Hypothetical q2[st, u]. +inline double hyp_q2(const IntPair &st, int u) { + return p2q(hyp_p2(st, u), hyp_p1(st.first, st.second), p1[u]); +} + +// Hypothetical q2[u, st]. 
+inline double hyp_q2(int u, const IntPair &st) { + return p2q(hyp_p2(u, st), hyp_p1(st.first, st.second), p1[u]); +} + +inline double bi_hyp_q2(const IntPair &st, int u) { + return hyp_q2(st, u) + hyp_q2(u, st); +} + +// Hypothetical q2[st, st]. +inline double hyp_q2(const IntPair &st) { + double p = hyp_p2(_(st.first, st.second)); // p2[st,st] + double P = hyp_p1(st.first, st.second); + return p2q(p, P, P); +} + +//////////////////////////////////////////////////////////// + +// Return slot. +void put_cluster_in_slot(int a, int s) { + cluster2slot[a] = s; + slot2cluster[s] = a; +} +inline int put_cluster_in_free_slot(int a) { + int s = -1; + + // Find available slot. + if(free_slot1 != -1) s = free_slot1, free_slot1 = -1; + else if(free_slot2 != -1) s = free_slot2, free_slot2 = -1; + assert(s != -1); + + put_cluster_in_slot(a, s); + return s; +} + +inline void free_up_slots(int s, int t) { + free_slot1 = s; + free_slot2 = t; + cluster2slot.erase(slot2cluster[s]); + cluster2slot.erase(slot2cluster[t]); + slot2cluster[s] = slot2cluster[t] = -1; +} + +void init_slot(int s) { + // Clear any entries that relates to s. + // The p1 and L2 will be filled in densely, so they + // will be overwritten anyway. + FOR_SLOT(t) + p2[s][t] = q2[s][t] = p2[t][s] = q2[t][s] = 0; +} + +void add_to_set(const IntVec &phrases, IntIntMap &phrase_counts, int offset) { + forvec(_, int, a, phrases) + phrase_counts[a+offset]++; +} + +bool is_good_phrase(const IntVec &phrase) { + if(len(phrase) == 1) return phrase[0] != delim_word && phrase[0] != -1; // Can't be delimiter or an invalid word + + // HACK HACK HACK - pick out some phrases + // Can't be too many delim words. 
+ int di = index_of(phrase, delim_word, 1); + if(di > 0 && di < len(phrase)-1) return false; // Delimiter must occur at the ends + if(phrase[0] == delim_word && phrase[len(phrase)-1] == delim_word) return false; // Only one delimiter allowed + + // If every word is capitalized with the exception of some function + // words which must go in the middle + forvec(i, int, a, phrase) { + bool at_end = i == 0 || i == len(phrase)-1; + const string &word = db[a]; + bool is_upper = isupper(word[0]); + + if(at_end && !is_upper) return false; // Ends must be uppercase + if(is_upper) continue; // Ok + if(word[0] == '\'' || word == "of" || word == "and") continue; // Ok + return false; + } + return true; +} + +void read_restrict_text() { + // Read the words from the text file that restricts what words we will cluster + if(restrict_file.empty()) return; + track("read_restrict_text()", restrict_file, false); + read_text(restrict_file.c_str(), NULL, db, false, false, true); +} + +IntVecIntMap vec2phrase; +IntVec text; +void read_text_process_word(int w) { + text.push_back(w); +} +void read_text() { + track("read_text()", "", false); + + read_text(text_file.c_str(), read_text_process_word, db, !use_restrict, !use_restrict, !use_restrict); + T = len(text); + delim_word = db.lookup(delim_str, false, -1); + if(!paths2map) db.destroy_s2i(); // Conserve memory. + + // Count the phrases that we care about so we can map them all to integers. + track_block("Counting phrases", "", false) { + phrases.resize(plen+1); + for(int l = 1; l <= plen; l++) { + // Count. 
+ IntVecIntMap freqs; // phrase vector -> number of occurrences + for(int i = 0; i < T-l+1; i++) { + IntVec a_vec = subvector(text, i, i+l); + if(!is_good_phrase(a_vec)) continue; + freqs[a_vec]++; + } + + forcmap(const IntVec &, a_vec, int, count, IntVecIntMap, freqs) { + if(count < min_occur) continue; + + int a = len(phrase_freqs); + phrase_freqs.push_back(count); + vec2phrase[a_vec] = a; + forvec(_, int, w, a_vec) phrases[l].push_back(w); + } + + logs(len(freqs) << " distinct phrases of length " << l << ", keeping " << num_phrases(l) << " which occur at least " << min_occur << " times"); + } + } + + N = len(phrase_freqs); // number of phrases + + track_block("Finding left/right phrases", "", false) { + left_phrases.resize(N); + right_phrases.resize(N); + for(int l = 1; l <= plen; l++) { + for(int i = 0; i < T-l+1; i++) { + IntVec a_vec = subvector(text, i, i+l); + if(!contains(vec2phrase, a_vec)) continue; + int a = vec2phrase[a_vec]; + + // Left + for(int ll = 1; ll <= plen && i-ll >= 0; ll++) { + IntVec aa_vec = subvector(text, i-ll, i); + if(!contains(vec2phrase, aa_vec)) continue; + int aa = vec2phrase[aa_vec]; + left_phrases[a].push_back(aa); + //logs(i << ' ' << Cluster(a) << " L"); + } + + // Right + for(int ll = 1; ll <= plen && i+l+ll <= T; ll++) { + IntVec aa_vec = subvector(text, i+l, i+l+ll); + if(!contains(vec2phrase, aa_vec)) continue; + int aa = vec2phrase[aa_vec]; + right_phrases[a].push_back(aa); + //logs(i << ' ' << Cluster(a) << " R"); + } + } + } + } + +#if 1 + if(!featvec_file.empty()) { + ofstream out(featvec_file.c_str()); + out << N << ' ' << 2*N << endl; + foridx(a, N) { + IntIntMap phrase_counts; + add_to_set(left_phrases[a], phrase_counts, 0); + add_to_set(right_phrases[a], phrase_counts, N); + out << Phrase(a) << ' ' << len(phrase_counts); + forcmap(int, b, int, count, IntIntMap, phrase_counts) + out << '\t' << b << ' ' << count; + out << endl; + } + } +#endif + +#if 0 + foridx(a, N) { + track("", Cluster(a), true); + forvec(_, int, 
b, left_phrases[a]) + logs("LEFT " << Cluster(b)); + forvec(_, int, b, right_phrases[a]) + logs("RIGHT " << Cluster(b)); + } +#endif + + destroy(text); + initC = min(initC, N); + + logs("Text length: " << T << ", " << N << " phrases, " << len(db) << " words"); +} + +// O(C) time. +double compute_s1(int s) { // compute s1[s] + double q = 0.0; + + for(int t = 0; t < len(slot2cluster); t++) { + if (slot2cluster[t] == -1) continue; + q += bi_q2(s, t); + } + + return q; +} + +// O(C) time. +double compute_L2(int s, int t) { // compute L2[s, t] + assert(ORDER_VALID(s, t)); + // st is the hypothetical new cluster that combines s and t + + // Lose old associations with s and t + double l = 0.0; + for (int w = 0; w < len(slot2cluster); w++) { + if ( slot2cluster[w] == -1) continue; + l += q2[s][w] + q2[w][s]; + l += q2[t][w] + q2[w][t]; + } + l -= q2[s][s] + q2[t][t]; + l -= bi_q2(s, t); + + // Form new associations with st + FOR_SLOT(u) { + if(u == s || u == t) continue; + l -= bi_hyp_q2(_(s, t), u); + } + l -= hyp_q2(_(s, t)); // q2[st, st] + return l; +} + +void repcheck() { + if(!chk) return; + double sum; + + assert_eq(len(rep2cluster), len(cluster2rep)); + assert_eq(len(rep2cluster), len(cluster2slot)); + + assert(free_slot1 == -1 || slot2cluster[free_slot1] == -1); + assert(free_slot2 == -1 || slot2cluster[free_slot2] == -1); + FOR_SLOT(s) { + assert(contains(cluster2slot, slot2cluster[s])); + assert(cluster2slot[slot2cluster[s]] == s); + } + + sum = 0.0; + FOR_SLOT(s) FOR_SLOT(t) { + double q = q2[s][t]; + //logs(s << ' ' << t << ' ' << p2[s][t] << ' ' << p1[s] << ' ' << p1[t]); + assert_feq(q, p2q(p2[s][t], p1[s], p1[t])); + sum += q; + } + assert_feq(sum, curr_minfo); + + FOR_SLOT(s) FOR_SLOT(t) { + if(!ORDER_VALID(s, t)) continue; + double l = L2[s][t]; + assert(l + TOL >= 0); + assert_feq(l, compute_L2(s, t)); + } +} + +void dump() { + track("dump()", "", true); + FOR_SLOT(s) logs("p1[" << Slot(s) << "] = " << p1[s]); + FOR_SLOT(s) FOR_SLOT(t) logs("p2[" << 
Slot(s) << ", " << Slot(t) << "] = " << p2[s][t]); + FOR_SLOT(s) FOR_SLOT(t) logs("q2[" << Slot(s) << ", " << Slot(t) << "] = " << q2[s][t]); + FOR_SLOT(s) FOR_SLOT(t) logs("L2[" << Slot(s) << ", " << Slot(t) << "] = " << L2[s][t]); + logs("curr_minfo = " << curr_minfo); +} + + +// c is new cluster that has been just formed from a and b +// Want to compute L2[d, e] +// O(1) time. +double compute_L2_using_old(int s, int t, int u, int v, int w) { + assert(ORDER_VALID(v, w)); + assert(v != u && w != u); + + double l = L2[v][w]; + + // Remove old associations between v and w with s and t + l -= bi_q2(v, s) + bi_q2(w, s) + bi_q2(v, t) + bi_q2(w, t); + l += bi_hyp_q2(_(v, w), s) + bi_hyp_q2(_(v, w), t); + + // Add new associations between v and w with u (ab) + l += bi_q2(v, u) + bi_q2(w, u); + l -= bi_hyp_q2(_(v, w), u); + + return l; +} + +// return q2 +double set_p2_q2_from_count(int s, int t, int count) { + double pst = (double)count / (T-1); // p2[s,t] + double ps = p1[s]; + double pt = p1[t]; + double qst = p2q(pst, ps, pt); // q2[s,t] + p2[s][t] = pst; + q2[s][t] = qst; + return qst; +} + +// O(N lg N) time. +// Sort the phrases by decreasing frequency and then set the initC most frequent +// phrases to be in the initial cluster. +bool phrase_freq_greater(int a, int b) { + return phrase_freqs[a] > phrase_freqs[b]; +} +void create_initial_clusters() { + track("create_initial_clusters()", "", true); + + freq_order_phrases.resize(N); + foridx(a, N) freq_order_phrases[a] = a; + + logs("Sorting " << N << " phrases by frequency"); + sort(freq_order_phrases.begin(), freq_order_phrases.end(), phrase_freq_greater); + + // Initialize slots + logs("Selecting top " << initC << " phrases to be initial clusters"); + nslots = initC+2; + slot2cluster.resize(nslots); + free_up_slots(initC, initC+1); + + // Create the inital clusters. 
+ phrase2rep.Init(N); // Init union-set: each phrase starts out in its own cluster + curr_minfo = 0.0; + foridx(s, initC) { + int a = freq_order_phrases[s]; + put_cluster_in_slot(a, s); + + rep2cluster[a] = a; + cluster2rep[a] = a; + } + + // Allocate memory + p1.resize(nslots); + matrix_resize(p2, nslots, nslots); + matrix_resize(q2, nslots, nslots); + matrix_resize(L2, nslots, nslots); + + FOR_SLOT(s) init_slot(s); + + // Compute p1 + FOR_SLOT(s) { + int a = slot2cluster[s]; + p1[s] = (double)phrase_freqs[a] / T; + } + + // Compute p2, q2, curr_minfo + FOR_SLOT(s) { + int a = slot2cluster[s]; + IntIntMap right_phrase_freqs; + + // Find collocations of (a, b), where both are clusters. + forvec(_, int, b, right_phrases[a]) + if(contains(cluster2slot, b)) + right_phrase_freqs[b]++; + + forcmap(int, b, int, count, IntIntMap, right_phrase_freqs) { + int t = cluster2slot[b]; + curr_minfo += set_p2_q2_from_count(s, t, count); + } + } +} + +// Output the ncollocs bigrams that have the highest mutual information. +void output_best_collocations() { + if(collocs_file.empty()) return; + logs("Writing to " << collocs_file); + + vector< pair > collocs; + FOR_SLOT(s) FOR_SLOT(t) { + collocs.push_back(pair(q2[s][t], _(slot2cluster[s], slot2cluster[t]))); + } + ncollocs = min(ncollocs, len(collocs)); + partial_sort(collocs.begin(), collocs.begin()+ncollocs, collocs.end(), greater< pair >()); + + ofstream out(collocs_file.c_str()); + assert(out); + for(int i = 0; i < ncollocs; i++) { + const IntPair &ab = collocs[i].second; + out << collocs[i].first << '\t' << Phrase(ab.first) << '\t' << Phrase(ab.second) << endl; + } +} + +// O(C^3) time. 
+void compute_L2() { + track("compute_L2()", "", true); + + track_block("Computing L2", "", false) + + for(int s = 0; s < (int)(slot2cluster).size(); s++) \ + for(bool _tmp = true; slot2cluster[s] != -1 && _tmp; _tmp = false) { + track_block("L2", "L2[" << Slot(s) << ", *]", false) +#pragma omp parallel for schedule (dynamic,16) + for(int t = 0; t < (int)(slot2cluster).size(); t++) \ + for(bool _tmp = true; slot2cluster[t] != -1 && _tmp; _tmp = false) { + if(!ORDER_VALID(s, t)) continue; + double l = L2[s][t] = compute_L2(s, t); + logs("L2[" << Slot(s) << "," << Slot(t) << "] = " << l << ", resulting minfo = " << curr_minfo-l); + } + } +} + +// Add new phrase as a cluster. +// Compute its L2 between a and all existing clusters. +// O(C^2) time, O(T) time over all calls. +void incorporate_new_phrase(int a) { + track("incorporate_new_phrase()", Cluster(a), false); + + int s = put_cluster_in_free_slot(a); + init_slot(s); + cluster2rep[a] = a; + rep2cluster[a] = a; + + // Compute p1 + p1[s] = (double)phrase_freqs[a] / T; + + // Overall all calls: O(T) + // Compute p2, q2 between a and everything in clusters + IntIntMap freqs; + freqs.clear(); // right bigrams + forvec(_, int, b, right_phrases[a]) { + b = phrase2rep.GetRoot(b); + if(!contains(rep2cluster, b)) continue; + b = rep2cluster[b]; + if(!contains(cluster2slot, b)) continue; + freqs[b]++; + } + forcmap(int, b, int, count, IntIntMap, freqs) { + curr_minfo += set_p2_q2_from_count(cluster2slot[a], cluster2slot[b], count); + logs(Cluster(a) << ' ' << Cluster(b) << ' ' << count << ' ' << set_p2_q2_from_count(cluster2slot[a], cluster2slot[b], count)); + } + + freqs.clear(); // left bigrams + forvec(_, int, b, left_phrases[a]) { + b = phrase2rep.GetRoot(b); + if(!contains(rep2cluster, b)) continue; + b = rep2cluster[b]; + if(!contains(cluster2slot, b)) continue; + freqs[b]++; + } + forcmap(int, b, int, count, IntIntMap, freqs) { + curr_minfo += set_p2_q2_from_count(cluster2slot[b], cluster2slot[a], count); + 
logs(Cluster(b) << ' ' << Cluster(a) << ' ' << count << ' ' << set_p2_q2_from_count(cluster2slot[b], cluster2slot[a], count)); + } + + curr_minfo -= q2[s][s]; // q2[s, s] was double-counted + + // Update L2: O(C^2) + track_block("Update L2", "", false) { + + the_job.s = s; + the_job.is_type_a = true; + // start the jobs + for (int ii=0; ii number of times a-b appears + IntIntMap count1; // cluster a -> number of times a appears + + // Compute cluster distributions + foridx(a, N) { + int ca = phrase2cluster(a); + forvec(_, int, b, right_phrases[a]) { + int cb = phrase2cluster(b); + count2[IntPair(ca, cb)]++; + count1[ca]++; + count1[cb]++; + } + } + + // For each word (phrase), compute its distribution + kl_map[0].resize(N); + kl_map[1].resize(N); + foridx(a, N) { + int ca = phrase2cluster(a); + IntIntMap a_count2; + int a_count1 = 0; + real kl; + + // Left distribution + a_count2.clear(), a_count1 = 0; + forvec(_, int, b, left_phrases[a]) { + int cb = phrase2cluster(b); + a_count2[cb]++; + a_count1++; + } + kl = kl_map[0][a] = kl_divergence(a_count2, a_count1, count2, count1, ca, false); + //logs("Left-KL(" << Phrase(a) << " | " << Cluster(ca) << ") = " << kl); + + // Right distribution + a_count2.clear(), a_count1 = 0; + forvec(_, int, b, right_phrases[a]) { + int cb = phrase2cluster(b); + a_count2[cb]++; + a_count1++; + } + kl = kl_map[1][a] = kl_divergence(a_count2, a_count1, count2, count1, ca, true); + //logs("Right-KL(" << Phrase(a) << " | " << Cluster(ca) << ") = " << kl); + } +} + +int word2phrase(int a) { + IntVecIntMap::const_iterator it = vec2phrase.find(to_vector(1, a)); + return it == vec2phrase.end() ? 
-1 : it->second; +} + +// Read in from paths_file and fill in phrase2rep, rep2cluster +void convert_paths_to_map() { + track("convert_paths_to_map()", "", false); + assert(!paths_file.empty() && !map_file.empty()); + + // Read clusters + ifstream in(paths_file.c_str()); + char buf[1024]; + typedef unordered_map SSVMap; + SSVMap map; + while(in.getline(buf, sizeof(buf))) { + char *path = strtok(buf, "\t"); + char *word = strtok(NULL, "\t"); + assert(word && path); + map[path].push_back(word); + } + + // Create the inital clusters. + phrase2rep.Init(N); // Init union-set: each phrase starts out in its own cluster + foridx(a, N) { + rep2cluster[a] = a; + cluster2rep[a] = a; + } + + // Merge clusters + curr_cluster_id = N; // New cluster ids will start at N, after all the phrases. + forcmap(const string &, path, const StringVec &, words, SSVMap, map) { + int a = -1; + forvec(i, const string &, word, words) { + int b = word2phrase(db.lookup(word.c_str(), false, -1)); + if(b == -1) continue; + if(a != -1) { + // Record merge in the cluster tree + int c = curr_cluster_id++; + cluster_tree[c] = _(a, b); + + // Update relationship between clusters and rep phrases + int A = cluster2rep[a]; + int B = cluster2rep[b]; + phrase2rep.Join(A, B); + int C = phrase2rep.GetRoot(A); // New rep phrase of cluster c (merged a and b) + + cluster2rep.erase(a); + cluster2rep.erase(b); + rep2cluster.erase(A); + rep2cluster.erase(B); + cluster2rep[c] = C; + rep2cluster[C] = c; + a = c; + } + else + a = b; + } + } + + compute_cluster_distribs(); + + // Merge clusters + ofstream out(map_file.c_str()); + forcmap(const string &, path, const StringVec &, words, SSVMap, map) { + forvec(_, const string &, word, words) { + int a = word2phrase(db.lookup(word.c_str(), false, -1)); + if(a == -1) continue; + + /*cout << a << ' ' << N << endl; + cout << Phrase(a) << endl; + cout << kl_map[0][a] << endl; + cout << kl_map[1][a] << endl; + cout << phrase_freqs[a] << endl;*/ + + out << Phrase(a) << '\t' + << 
path << "-L " << kl_map[0][a] << '\t' + << path << "-R " << kl_map[1][a] << '\t' + << path << "-freq " << phrase_freqs[a] << endl; + } + } +} + +void do_clustering() { + track("do_clustering()", "", true); + + compute_L2(); + repcheck(); + + // start the threads + thread_start = new mutex[num_threads]; + thread_idle = new mutex[num_threads]; + threads = new thread[num_threads]; + for (int ii=0; iifirst, 0, '\0')); + + while(!stack.empty()) { + // Take off a stack item (a node in the tree). + StackItem item = stack.back(); + int a = item.a; + int path_i = item.path_i; + if(item.ch) + path[path_i-1] = item.ch; + stack.pop_back(); + + // Look at the node's children (if any). + IntIntPairMap::const_iterator it = cluster_tree.find(a); + if(it == cluster_tree.end()) { + path[path_i] = '\0'; + if(out_paths) paths_out << path << '\t' << Phrase(a) << '\t' << phrase_freqs[a] << endl; + if(out_map) map_out << Phrase(a) << '\t' + << path << "-L " << kl_map[0][a] << '\t' + << path << "-R " << kl_map[1][a] << '\t' + << path << "-freq " << phrase_freqs[a] << endl; + } + else { + const IntPair &children = it->second; + // Only print out paths through the part of the tree constructed in stage 2. + bool extend = a >= stage2_cluster_offset; + int new_path_i = path_i + extend; + + stack.push_back(StackItem(children.second, new_path_i, extend ? '1' : '\0')); + stack.push_back(StackItem(children.first, new_path_i, extend ? '0' : '\0')); + } + } +} + +int main(int argc, char *argv[]) { + init_opt(argc, argv); + + /* Bind value of a to c if user chose that alias instead. C, as + * originally implemented, serves the purpose of a in Generalised Brown. + */ + if( initA != initC && initA != INPUT_DEFAULT_A ) { initC = initA; } + + assert(file_exists(text_file.c_str())); + + // Set output_dir from arguments. 
+ if(output_dir.empty()) { + output_dir = file_base(strip_dir(text_file)); + output_dir += str_printf("-c%d", initC); + output_dir += str_printf("-p%d", plen); + if(!restrict_file.empty()) output_dir += str_printf("-R%s", file_base(strip_dir(restrict_file)).c_str()); + output_dir += ".out"; + } + + if(system(("mkdir -p " + output_dir).c_str()) != 0) + assert2(false, "Can't create " << output_dir); + if(system(("rm -f " + output_dir + "/*").c_str()) != 0) + assert2(false, "Can't remove things in " << output_dir); + + // Set arguments from the output_dir. + if(!output_dir.empty()) { + if(paths_file.empty()) paths_file = output_dir+"/paths"; + if(map_file.empty()) map_file = output_dir+"/map"; + if(collocs_file.empty()) collocs_file = output_dir+"/collocs"; + if(log_info.log_file.empty()) log_info.log_file = output_dir+"/log"; + if(!print_stats) { + merge_log.open( output_dir + "/merges" ); /* cluster.py compatability. */ + } + } + + init_log; + + track_mem(db); + track_mem(phrase_freqs); + track_mem(left_phrases); + track_mem(right_phrases); + track_mem(cluster_tree); + track_mem(freq_order_phrases); + track_mem(phrase2rep); + track_mem(rep2cluster); + track_mem(cluster2rep); + track_mem(phrases); + track_mem(slot2cluster); + track_mem(cluster2slot); + track_mem(p1); + track_mem(p2); + track_mem(q2); + track_mem(L2); + omp_set_num_threads(num_threads); + + read_restrict_text(); + read_text(); + if(featvec_file.empty()) { + if(paths2map) + convert_paths_to_map(); + else if(!print_stats) { + create_initial_clusters(); + output_best_collocations(); + do_clustering(); + output_cluster_paths(); + } + } + + return 0; +}