From aac3c5fe8e3ea357eb708bfaef113a516baa685d Mon Sep 17 00:00:00 2001 From: "Michael J.T. O'Kelly" Date: Tue, 1 Apr 2014 00:16:35 -0700 Subject: [PATCH] -Ported Wabbit Wappa from parent project -Project framework via cookiecutter --- .gitignore | 42 +++++ AUTHORS.rst | 13 ++ CONTRIBUTING.rst | 109 +++++++++++++ HISTORY.rst | 9 ++ LICENSE | 20 +++ MANIFEST.in | 5 + Makefile | 56 +++++++ README.rst | 79 +++++++++ Vagrantfile | 121 ++++++++++++++ requirements.txt | 5 + scripts/bashrc.sh | 5 + scripts/provision.sh | 15 ++ scripts/vw-install.sh | 15 ++ setup.cfg | 2 + setup.py | 58 +++++++ test/__init__.py | 0 test/test_wabbit_wappa.py | 19 +++ wabbit_wappa/__init__.py | 328 ++++++++++++++++++++++++++++++++++++++ 18 files changed, 901 insertions(+) create mode 100644 .gitignore create mode 100644 AUTHORS.rst create mode 100644 CONTRIBUTING.rst create mode 100644 HISTORY.rst create mode 100644 LICENSE create mode 100644 MANIFEST.in create mode 100644 Makefile create mode 100644 README.rst create mode 100644 Vagrantfile create mode 100644 requirements.txt create mode 100644 scripts/bashrc.sh create mode 100644 scripts/provision.sh create mode 100644 scripts/vw-install.sh create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 test/__init__.py create mode 100644 test/test_wabbit_wappa.py create mode 100644 wabbit_wappa/__init__.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..53eacf5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,42 @@ +*.py[cod] + +# C extensions +*.so + +# Packages +*.egg +*.egg-info +dist +build +eggs +parts +bin +var +sdist +develop-eggs +.installed.cfg +lib +lib64 + +# Installer logs +pip-log.txt + +# Unit test / coverage reports +.coverage +.tox +nosetests.xml + +# Translations +*.mo + +# Mr Developer +.mr.developer.cfg +.project +.pydevproject + +# Complexity +output/*.html +output/*/index.html + +# Sphinx +docs/_build \ No newline at end of file diff --git a/AUTHORS.rst b/AUTHORS.rst new file mode 100644 
index 0000000..af94b54 --- /dev/null +++ b/AUTHORS.rst @@ -0,0 +1,13 @@ +======= +Credits +======= + +Development Lead +---------------- + +* Michael J.T. O'Kelly + +Contributors +------------ + +None yet. Why not be the first? \ No newline at end of file diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst new file mode 100644 index 0000000..1d0db67 --- /dev/null +++ b/CONTRIBUTING.rst @@ -0,0 +1,109 @@ +============ +Contributing +============ + +Contributions are welcome, and they are greatly appreciated! Every +little bit helps, and credit will always be given. + +You can contribute in many ways: + +Types of Contributions +---------------------- + +Report Bugs +~~~~~~~~~~~ + +Report bugs at https://github.com/mokelly/wabbit_wappa/issues. + +If you are reporting a bug, please include: + +* Your operating system name and version. +* Any details about your local setup that might be helpful in troubleshooting. +* Detailed steps to reproduce the bug. + +Fix Bugs +~~~~~~~~ + +Look through the GitHub issues for bugs. Anything tagged with "bug" +is open to whoever wants to implement it. + +Implement Features +~~~~~~~~~~~~~~~~~~ + +Look through the GitHub issues for features. Anything tagged with "feature" +is open to whoever wants to implement it. + +Write Documentation +~~~~~~~~~~~~~~~~~~~ + +Wabbit Wappa could always use more documentation, whether as part of the +official Wabbit Wappa docs, in docstrings, or even on the web in blog posts, +articles, and such. + +Submit Feedback +~~~~~~~~~~~~~~~ + +The best way to send feedback is to file an issue at https://github.com/mokelly/wabbit_wappa/issues. + +If you are proposing a feature: + +* Explain in detail how it would work. +* Keep the scope as narrow as possible, to make it easier to implement. +* Remember that this is a volunteer-driven project, and that contributions + are welcome :) + +Get Started! +------------ + +Ready to contribute? Here's how to set up `wabbit_wappa` for +local development. + +1. 
Fork_ the `wabbit_wappa` repo on GitHub. +2. Clone your fork locally:: + + $ git clone git@github.com:your_name_here/wabbit_wappa.git + +3. Create a branch for local development:: + + $ git checkout -b name-of-your-bugfix-or-feature + +Now you can make your changes locally. + +4. When you're done making changes, check that your changes pass style and unit + tests, including testing other Python versions with tox:: + + $ tox + +To get tox, just pip install it. + +5. Commit your changes and push your branch to GitHub:: + + $ git add . + $ git commit -m "Your detailed description of your changes." + $ git push origin name-of-your-bugfix-or-feature + +6. Submit a pull request through the GitHub website. + +.. _Fork: https://github.com/mokelly/wabbit_wappa/fork + +Pull Request Guidelines +----------------------- + +Before you submit a pull request, check that it meets these guidelines: + +1. The pull request should include tests. +2. If the pull request adds functionality, the docs should be updated. Put + your new functionality into a function with a docstring, and add the + feature to the list in README.rst. +3. The pull request should work for Python 2.6, 2.7, and 3.3, and for PyPy. + Check https://travis-ci.org/mokelly/wabbit_wappa + under pull requests for active pull requests or run the ``tox`` command and + make sure that the tests pass for all supported Python versions. + + +Tips +---- + +To run a subset of tests:: + + $ py.test test/test_wabbit_wappa.py \ No newline at end of file diff --git a/HISTORY.rst b/HISTORY.rst new file mode 100644 index 0000000..00cf7f6 --- /dev/null +++ b/HISTORY.rst @@ -0,0 +1,9 @@ +.. :changelog: + +History +------- + +0.0.1 (2014-03-31) +++++++++++++++++++ + +* First release on GitHub \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f8283eb --- /dev/null +++ b/LICENSE @@ -0,0 +1,20 @@ +The MIT License (MIT) + +Copyright (c) 2014 Michael J.T.
O'Kelly + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
\ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..417f37b --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,5 @@ +include AUTHORS.rst +include CONTRIBUTING.rst +include HISTORY.rst +include LICENSE +include README.rst \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..5bde3de --- /dev/null +++ b/Makefile @@ -0,0 +1,56 @@ +.PHONY: help clean clean-pyc clean-build list test test-all coverage docs release sdist + +help: + @echo "clean-build - remove build artifacts" + @echo "clean-pyc - remove Python file artifacts" + @echo "lint - check style with flake8" + @echo "test - run tests quickly with the default Python" + @echo "testall - run tests on every Python version with tox" + @echo "coverage - check code coverage quickly with the default Python" + @echo "docs - generate Sphinx HTML documentation, including API docs" + @echo "release - package and upload a release" + @echo "sdist - package" + +clean: clean-build clean-pyc + +clean-build: + rm -fr build/ + rm -fr dist/ + rm -fr *.egg-info + +clean-pyc: + find . -name '*.pyc' -exec rm -f {} + + find . -name '*.pyo' -exec rm -f {} + + find . 
-name '*~' -exec rm -f {} + + +lint: + flake8 wabbit_wappa test + +test: + py.test + +test-all: + tox + +coverage: + coverage run --source wabbit_wappa setup.py test + coverage report -m + coverage html + open htmlcov/index.html + +docs: + rm -f docs/wabbit_wappa.rst + rm -f docs/modules.rst + sphinx-apidoc -o docs/ wabbit_wappa + $(MAKE) -C docs clean + $(MAKE) -C docs html + open docs/_build/html/index.html + +release: clean + python setup.py sdist upload + python setup.py bdist_wheel upload + +sdist: clean + python setup.py sdist + python setup.py bdist_wheel upload + ls -l dist \ No newline at end of file diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..8270caf --- /dev/null +++ b/README.rst @@ -0,0 +1,79 @@ +############## +Wabbit Wappa +############## + +**Wabbit Wappa** is a full-featured Python wrapper for the lightning fast `Vowpal Wabbit <https://github.com/JohnLangford/vowpal_wabbit>`_ ("VW") machine learning utility. Wabbit Wappa makes it easy to use VW's powerful features while not dealing with its idiosyncratic syntax and interface. + +**************** +Features +**************** + +* Complete Pythonic wrapper for the Vowpal Wabbit training and test syntax +* Online training and testing, with no need to restart VW or reload the trained model to go between them +* Save the trained model on the fly + +**************** +Getting Started +**************** + +If you're unfamiliar with Vowpal Wabbit, this documentation is no substitute for the `detailed tutorials <https://github.com/JohnLangford/vowpal_wabbit/wiki/Tutorial>`_ + at the VW wiki. You'll eventually need to read those to understand VW's advanced features. + +Installation +=============== + +*Coming soon: install via Pip* + +Start by cloning the WW repository:: + + git clone https://github.com/mokelly/wabbit_wappa.git + cd wabbit_wappa + +You have three installation options, depending on your comfort with compiling and installing the VW utility.
+ +**If you already have Vowpal Wabbit installed**:: + + python setup.py install + +**If you still need to install VW and its dependencies**:: + + scripts/vw-install.sh + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib; + python setup.py install + +(The "export" line should be added to your .profile if you don't want to run it every time you use Vowpal Wabbit.) + +**If you want a virtual machine with everything all set up for you:** + +*Windows users, this is your only option at present* + +First install the virtual machine manager `Vagrant <http://www.vagrantup.com/>`_ along with your favorite virtualization system (such as `VirtualBox <https://www.virtualbox.org/>`_). +Then from the Wabbit Wappa source directory type:: + + vagrant up + +This will launch an Ubuntu VM and provision it with VW and WW, completely automatically! Once that's all complete, just SSH to your new VM with:: + + vagrant ssh + +Testing +--------- + +Make sure everything is installed and configured correctly by running the tests:: + + py.test + +Usage Example +=============== + + *TODO* + +**************** +Documentation +**************** + +For now, read the docstrings:: + + import wabbit_wappa + help(wabbit_wappa) + diff --git a/Vagrantfile b/Vagrantfile new file mode 100644 index 0000000..4262975 --- /dev/null +++ b/Vagrantfile @@ -0,0 +1,121 @@ +# -*- mode: ruby -*- +# vi: set ft=ruby : + +# Vagrantfile API/syntax version. Don't touch unless you know what you're doing! +VAGRANTFILE_API_VERSION = "2" + +Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| + # All Vagrant configuration is done here. The most common configuration + # options are documented and commented below. For a complete reference, + # please see the online documentation at vagrantup.com. + + # Every Vagrant virtual environment requires a box to build off of. + config.vm.box = "quantal64" + + # The url from where the 'config.vm.box' box will be fetched if it + # doesn't already exist on the user's system.
+ config.vm.box_url = "http://cloud-images.ubuntu.com/vagrant/quantal/current/quantal-server-cloudimg-amd64-vagrant-disk1.box" + + # Create a forwarded port mapping which allows access to a specific port + # within the machine from a port on the host machine. In the example below, + # accessing "localhost:8080" will access port 80 on the guest machine. + # config.vm.network "forwarded_port", guest: 80, host: 8080 + + # Create a private network, which allows host-only access to the machine + # using a specific IP. + # config.vm.network "private_network", ip: "192.168.33.10" + + # Create a public network, which generally matched to bridged network. + # Bridged networks make the machine appear as another physical device on + # your network. + # config.vm.network "public_network" + + # If true, then any SSH connections made will enable agent forwarding. + # Default value: false + # config.ssh.forward_agent = true + + # Share an additional folder to the guest VM. The first argument is + # the path on the host to the actual folder. The second argument is + # the path on the guest to mount the folder. And the optional third + # argument is a set of non-required options. + # config.vm.synced_folder "../data", "/vagrant_data" + + # Provider-specific configuration so you can fine-tune various + # backing providers for Vagrant. These expose provider-specific options. + # Example for VirtualBox: + # + # config.vm.provider "virtualbox" do |vb| + # # Don't boot with headless mode + # vb.gui = true + # + # # Use VBoxManage to customize the VM. For example to change memory: + # vb.customize ["modifyvm", :id, "--memory", "1024"] + # end + # + # View the documentation for the provider you're using for more + # information on available options. + + # Enable provisioning with Puppet stand alone. Puppet manifests + # are contained in a directory path relative to this Vagrantfile. 
+ # You will need to create the manifests directory and a manifest in + # the file hashicorp/quantal64.pp in the manifests_path directory. + # + # An example Puppet manifest to provision the message of the day: + # + # # group { "puppet": + # # ensure => "present", + # # } + # # + # # File { owner => 0, group => 0, mode => 0644 } + # # + # # file { '/etc/motd': + # # content => "Welcome to your Vagrant-built virtual machine! + # # Managed by Puppet.\n" + # # } + # + # config.vm.provision "puppet" do |puppet| + # puppet.manifests_path = "manifests" + # puppet.manifest_file = "site.pp" + # end + + # Enable provisioning with chef solo, specifying a cookbooks path, roles + # path, and data_bags path (all relative to this Vagrantfile), and adding + # some recipes and/or roles. + # + # config.vm.provision "chef_solo" do |chef| + # chef.cookbooks_path = "../my-recipes/cookbooks" + # chef.roles_path = "../my-recipes/roles" + # chef.data_bags_path = "../my-recipes/data_bags" + # chef.add_recipe "mysql" + # chef.add_role "web" + # + # # You may also specify custom JSON attributes: + # chef.json = { :mysql_password => "foo" } + # end + + # Enable provisioning with chef server, specifying the chef server URL, + # and the path to the validation key (relative to this Vagrantfile). + # + # The Opscode Platform uses HTTPS. Substitute your organization for + # ORGNAME in the URL and validation key. + # + # If you have your own Chef Server, use the appropriate URL, which may be + # HTTP instead of HTTPS depending on your configuration. Also change the + # validation key to validation.pem. + # + # config.vm.provision "chef_client" do |chef| + # chef.chef_server_url = "https://api.opscode.com/organizations/ORGNAME" + # chef.validation_key_path = "ORGNAME-validator.pem" + # end + # + # If you're using the Opscode platform, your validator client is + # ORGNAME-validator, replacing ORGNAME with your organization name. 
+ # + # If you have your own Chef Server, the default validation client name is + # chef-validator, unless you changed the configuration. + # + # chef.validation_client_name = "ORGNAME-validator" + + # Provisioning via shell script + config.vm.provision :shell, :path => "scripts/provision.sh" +end diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b922f0c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +pexpect +pytest +cookiecutter +wheel>=0.22 +pip>=1.4 \ No newline at end of file diff --git a/scripts/bashrc.sh b/scripts/bashrc.sh new file mode 100644 index 0000000..1722d00 --- /dev/null +++ b/scripts/bashrc.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +# Make sure library path includes install location of Vowpal Wabbit libraries +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib; + diff --git a/scripts/provision.sh b/scripts/provision.sh new file mode 100644 index 0000000..c69b1da --- /dev/null +++ b/scripts/provision.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +cd ~ +sudo apt-get update +sudo apt-get -yqqu install git +sudo apt-get -yqqu install python-pip +sudo apt-get -yqqu install python-dev +sudo apt-get -yqqu install make +sudo pip install -r /vagrant/requirements.txt +/vagrant/scripts/vw-install.sh + +cd /vagrant + +cat /vagrant/scripts/bashrc.sh >> /home/vagrant/.bashrc +sudo python setup.py install diff --git a/scripts/vw-install.sh b/scripts/vw-install.sh new file mode 100644 index 0000000..d963094 --- /dev/null +++ b/scripts/vw-install.sh @@ -0,0 +1,15 @@ +#!/bin/bash +sudo apt-get -yqqu install libboost-all-dev + +wget https://github.com/JohnLangford/vowpal_wabbit/archive/7.5.tar.gz +tar -zxvf 7.5.tar.gz +cd vowpal_wabbit-7.5 + +./autogen.sh +./configure +make +sudo make install +# LD_LIBRARY_PATH is necessary for vw to find required .so file(s) +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib; + + diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..0a8df87 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 
@@
+[wheel]
+universal = 1
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..ac2cf6b
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+import os
+import sys
+
+try:
+    from setuptools import setup
+except ImportError:
+    from distutils.core import setup
+
+if sys.argv[-1] == 'publish':
+    os.system('python setup.py sdist upload')
+    sys.exit()
+
+# Read requirements.txt directly instead of importing pip.req: pip's
+# internal modules are not a supported API and pip.req no longer exists.
+with open('requirements.txt') as requirements_file:
+    req_list = [line.strip() for line in requirements_file
+                if line.strip() and not line.strip().startswith('#')]
+
+with open('README.rst') as readme_file:
+    readme = readme_file.read()
+doclink = """
+Documentation
+-------------
+
+The full documentation is at http://wabbit_wappa.rtfd.org."""
+with open('HISTORY.rst') as history_file:
+    history = history_file.read().replace('.. :changelog:', '')
+
+setup(
+    name='wabbit_wappa',
+    version='0.0.1',
+    description='Wabbit Wappa is a full-featured Python wrapper for the Vowpal Wabbit machine learning utility.',
+    long_description=readme + '\n\n' + doclink + '\n\n' + history,
+    author="Michael J.T.
O'Kelly",
+    author_email='mokelly@gmail.com',
+    url='https://github.com/mokelly/wabbit_wappa',
+    packages=[
+        'wabbit_wappa',
+    ],
+    package_dir={'wabbit_wappa': 'wabbit_wappa'},
+    include_package_data=True,
+    install_requires=req_list,
+    license='MIT',
+    zip_safe=False,
+    keywords='wabbit_wappa',
+    classifiers=[
+        'Development Status :: 2 - Pre-Alpha',
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: MIT License',
+        'Natural Language :: English',
+        'Programming Language :: Python :: 2',
+        'Programming Language :: Python :: 2.6',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.3',
+        'Programming Language :: Python :: Implementation :: PyPy',
+    ],
+)
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/test_wabbit_wappa.py b/test/test_wabbit_wappa.py
new file mode 100644
index 0000000..c1f2c9d
--- /dev/null
+++ b/test/test_wabbit_wappa.py
@@ -0,0 +1,19 @@
+
+from wabbit_wappa import *
+
+
+def test_namespace():
+    namespace = Namespace('MetricFeatures', 3.28, [('height', 1.5), ('length', 2.0)])
+    namespace_string = namespace.to_string()
+    assert namespace_string == 'MetricFeatures:3.28 height:1.5 length:2.0 '
+
+    namespace = Namespace(None, 3.28, ['height', 'length'])
+    namespace_string = namespace.to_string()
+    assert namespace_string == ' height length '
+
+    namespace = Namespace('Metric Features', 3.28, [('height|', 1.5), ('len:gth', 2.0)])
+    namespace_string = namespace.to_string()
+    assert 'Metric Features' not in namespace_string
+    assert '|' not in namespace_string
+    assert 'len:gth' not in namespace_string
+
diff --git a/wabbit_wappa/__init__.py b/wabbit_wappa/__init__.py
new file mode 100644
index 0000000..49d004f
--- /dev/null
+++ b/wabbit_wappa/__init__.py
@@ -0,0 +1,363 @@
+"""
+Wrapper for Vowpal Wabbit executable
+
+TODO:
+-Unit tests. Reproduce examples from wiki and from
+    http://hunch.net/~vw/validate.html
+-Documentation. Use mkdocs and something like picnic to put
+    this on PyPI in elegant style.
+-Command line generation. Beginner's mode for common scenarios
+    (and a framework to build from). Make up names for single-char
+    args and those missing documentation (like -i and -f) and accept
+    short or long form.
+-Performance testing. How much time is spent in wappa vs. VW?
+-Detect VW version in unit tests; for command line generation scenarios,
+    unit tests should detect whether it works as expected.
+-Scenario assistance; e.g. caching examples for reuse in multi-pass
+    (This would make good example code also.)
+    -Abstraction for passes (with automatic usage of example cache)
+-Example for README: Alternate train and testing to show how regressor converges over time
+-Example for README: Active learning interface
+-Sklearn compatibility (like vowpal_porpoise)
+-Vagrant that builds and installs dependencies (including VW) automatically
+    Then run unit tests to verify install
+-Use pexpect.which() to find executable automatically
+-Handle echo mode introspectively. Include unit test in which it's switched manually.
+-Installable with pip
+
+
+
+
+by Michael J.T. O'Kelly, 2014-2-24
+"""
+
+__author__ = "Michael J.T. O'Kelly"
+__email__ = 'mokelly@gmail.com'
+__version__ = '0.0.1'
+
+
+import logging
+import re
+
+import pexpect
+
+
+try:
+    string_types = basestring  # Python 2: matches both str and unicode
+except NameError:
+    string_types = str  # Python 3 has no basestring
+
+
+class WabbitInvalidCharacter(ValueError):
+    """Raised by validate_vw_string() for strings holding reserved characters."""
+
+
+# Matches the characters escape_dict has replacements for: ' ', ':' and '|'
+validation_regex = re.compile(r' |:|\|')
+
+
+def validate_vw_string(s):
+    """Throw a WabbitInvalidCharacter exception if the string contains
+    a space, ':' or '|' (the characters matched by validation_regex).
+    (http://stats.stackexchange.com/questions/28877/finding-the-best-features-in-interaction-models)
+    """
+    if validation_regex.search(s):
+        raise WabbitInvalidCharacter(s)
+
+
+# Replacement sequence for each character matched by validation_regex
+escape_dict = {' ': r'\_',
+               ':': r'\;',
+               '|': r'\\'
+               }
+
+
+def escape_vw_character(special_character_re_match):
+    """Substitution callback: map a validation_regex match to its
+    replacement from escape_dict."""
+    special_character = special_character_re_match.group()
+    return escape_dict[special_character]
+
+
+def escape_vw_string(s):
+    """Return a copy of 's' with every character matched by
+    validation_regex replaced by its escape_dict sequence."""
+    escaped_s = validation_regex.sub(escape_vw_character, s)
+    return escaped_s
+
+
+class Namespace(object):
+    """Abstraction of Namespace part of VW example lines"""
+    def __init__(self,
+                 name=None,
+                 scale=None,
+                 features=None,
+                 escape=True,
+                 validate=True,
+                 cache_string=False):
+        """Create a namespace with given (optional) name and importance,
+        initialized with any given features (described in add_features()).
+        If 'validate', name and features are validated for compatibility
+            with VW's reserved characters, throwing a WabbitInvalidCharacter
+            exception.
+        If 'escape', any invalid characters are replaced with escape characters.
+            ('escape' mode supersedes 'validate' mode.)
+        If 'cache_string', the results of any to_string() call are cached
+            permanently, ignoring any further changes to self.
+        """
+        self.name = name
+        self.scale = scale
+        self.validate = validate
+        self.escape = escape
+        self.cache_string = cache_string
+        self._string = None  # Filled by to_string() only when cache_string is set
+        self.features = []
+        if name:
+            if escape:
+                self.name = escape_vw_string(self.name)
+            elif validate:
+                validate_vw_string(self.name)
+        if features:
+            self.add_features(features)
+
+    def add_features(self, features):
+        """Add features to this namespace.
+        features: An iterable of features. A feature may be either
+            1) A VW label (not containing characters from escape_dict.keys(),
+                unless 'escape' mode is on)
+            2) A tuple (label, value) where value is any float
+        """
+        for feature in features:
+            if isinstance(feature, string_types):
+                label = feature
+                value = None
+            else:
+                label, value = feature
+            self.add_feature(label, value)
+
+    def add_feature(self, label, value=None):
+        """
+        label: A VW label (not containing characters from escape_dict.keys(),
+            unless 'escape' mode is on)
+        value: float giving the weight or magnitude of this feature
+        """
+        if self.escape:
+            label = escape_vw_string(label)
+        elif self.validate:
+            validate_vw_string(label)
+        feature = (label, value)
+        self.features.append(feature)
+
+    def to_string(self):
+        """Export this namespace to a string suitable for incorporation
+        in a VW example line, e.g.
+        'MetricFeatures:3.28 height:1.5 length:2.0 '
+        """
+        if self._string is not None:
+            return self._string
+        tokens = []
+        if self.name:
+            if self.scale is not None:  # An explicit scale of 0 is still emitted
+                token = self.name + ':' + str(self.scale)
+            else:
+                token = self.name
+        else:
+            token = ''  # Spacing element to indicate next string is a feature
+        tokens.append(token)
+        for label, value in self.features:
+            if value is None:
+                token = label
+            else:
+                token = label + ':' + str(value)
+            tokens.append(token)
+        tokens.append('')  # Spacing element to separate from next pipe character
+        output = ' '.join(tokens)
+        if self.cache_string:
+            self._string = output
+        return output
+
+
+class VW(object):
+    """Wrapper for VW executable, handling online input and outputs."""
+    def __init__(self, command, raw_output=False):
+        """'command' is the full command-line necessary to run VW. E.g.
+        vw --loss_function logistic -p /dev/stdout --quiet
+        -p /dev/stdout --quiet is mandatory for compatibility,
+        and certain options like
+            --save_resume
+        are suggested, while some options make no sense in this context:
+            -d
+            --passes
+        wabbit_wappa.py does not support any mode that turns off piping to
+        stdin or stdout
+
+        raw_output: Instead of returning parsed float(s) as output, return
+            the string literal.
+        """
+        self.vw_process = pexpect.spawn(command)
+        # TODO: Use spawn(args=args) for more fine-grained control
+        self.vw_process.delaybeforesend = 0
+        logging.info("Started VW(%s)", command)
+        self.output_pipe = None
+        self.command = command
+        self.namespaces = []
+        self._line = None
+        self.set_raw_output(raw_output)
+
+    def __enter__(self):
+        """Allow 'with VW(command) as vw:'; the VW process is closed on exit."""
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+        return False  # Do not suppress exceptions raised inside the block
+
+    def send_line(self, line):
+        """Submit a raw line of text to the VW instance, returning the result.
+        """
+        self.vw_process.sendline(line)  # Send line, along with newline
+        result = self._get_response()
+        return result
+
+    def set_raw_output(self, raw_output):
+        """Set the value of raw_output, which determines whether VW output
+        is parsed into float(s) or returned literally."""
+        self.raw_output = raw_output
+        return self.raw_output
+
+    def _get_response(self):
+        """Read one response from VW. Returns the raw text if raw_output is
+        set or no float could be parsed; otherwise a single float, or a list
+        of floats when several were found."""
+        self.vw_process.expect('\r\n')  # Wait until process outputs a complete line
+        self.vw_process.expect('\r\n')  # Wait until process outputs a complete line twice
+        # Grabbing two lines seems to be necessary because vw_process.getecho() is True
+        output = self.vw_process.before
+        if self.raw_output:
+            result_value = output  # Return the output unchanged
+        else:
+            result_list = []
+            # TODO: Something more robust than whitespace splitting
+            #   to handle modes like --audit ?
+            for token in output.split():
+                try:
+                    result = float(token)
+                    result_list.append(result)
+                except ValueError:
+                    # Ignore tokens that can't be made into floats (like tags)
+                    logging.debug("Ignoring non-float token %s", token)
+            if len(result_list) == 1:
+                result_value = result_list[0]
+            elif len(result_list) > 1:
+                result_value = result_list
+            else:
+                # If no floats were found, return the unparsed output
+                # TODO: Should an exception be raised here instead?
+                result_value = output
+        return result_value
+
+    def send_example(self,
+                     *args,
+                     **kwargs
+                     ):
+        """Build an example line with make_line(*args, **kwargs), send it
+        to VW and return the result."""
+        line = self.make_line(*args, **kwargs)
+        result = self.send_line(line)
+        return result
+
+    def make_line(self,
+                  response=None,
+                  importance=None,
+                  base=None,
+                  tag=None,
+                  features=None,
+                  namespaces=None,
+                  ):
+        """Assemble and return a VW example line from the given label fields
+        and all queued namespaces; the namespace queue is emptied afterwards.
+        namespaces: Namespace instances to queue before building the line
+        features: features for one extra unnamed Namespace, queued last
+        """
+        if namespaces is not None:
+            self.add_namespaces(namespaces)
+        if features is not None:
+            namespace = Namespace(features=features)
+            self.add_namespace(namespace)
+        substrings = []
+        tokens = []
+        if response is not None:
+            token = str(response)
+            tokens.append(token)
+            if importance is not None:  # Check only if response is given
+                token = str(importance)
+                tokens.append(token)
+                if base is not None:  # Check only if importance is given
+                    token = str(base)
+                    tokens.append(token)
+        if tag is not None:
+            token = "'" + str(tag)  # Tags are unambiguous if given a ' prefix
+            tokens.append(token)
+        else:
+            token = ""  # Spacing element to avoid ambiguity in parsing
+            tokens.append(token)
+        substring = ' '.join(tokens)
+        substrings.append(substring)
+        if self.namespaces:
+            for namespace in self.namespaces:
+                substring = namespace.to_string()
+                substrings.append(substring)
+        else:
+            substrings.append('')  # For correct syntax
+        line = '|'.join(substrings)
+        self._line = line
+        self.namespaces = []  # Reset namespaces after their use
+        return line
+
+    def add_namespace(self, *args, **kwargs):
+        """Accepts two calling patterns:
+        add_namespace(namespace): queue a preexisting namespace onto
+            this VW instance.
+        add_namespace(name, scale, features, ...): Pass all args and kwargs
+            to the Namespace constructor to make a new Namespace instance,
+            and queue it to this VW instance.
+
+        Returns self (so that this command can be chained).
+        """
+        if args and isinstance(args[0], Namespace):
+            namespace = args[0]
+        elif isinstance(kwargs.get('namespace'), Namespace):
+            namespace = kwargs.get('namespace')
+        else:
+            namespace = Namespace(*args, **kwargs)
+        self.namespaces.append(namespace)
+        return self
+
+    def add_namespaces(self, namespaces):
+        """Add these namespaces sequentially.
+        Returns self (so that this command can be chained)."""
+        for namespace in namespaces:
+            self.add_namespace(namespace)
+        return self
+
+    def get_prediction(self, tag=None, namespaces=None):
+        """Send an example with no response (label) and return VW's result."""
+        result = self.send_example(tag=tag, namespaces=namespaces)
+        return result
+
+    def save_model(self, model_filename):
+        """Pass a "command example" to the VW subprocess requesting
+        that the current model be serialized to model_filename immediately."""
+        line = "save_{0}|".format(model_filename)
+        self.vw_process.sendline(line)
+        self.vw_process.expect('\r\n')  # Wait until process outputs a complete line
+        # Only the echo will be emitted as a result for this command
+        result = self.vw_process.before
+        return result
+
+    def close(self):
+        """Close the VW subprocess."""
+        self.vw_process.close()
+
+    # TODO: Fancy interface for auditing data?
+