From c1ac445149acd117a184563deaccb1948d668970 Mon Sep 17 00:00:00 2001
From: Moritz Kiehn <msmk@cern.ch>
Date: Tue, 24 Apr 2018 11:50:41 +0200
Subject: [PATCH 1/5] README: some wording and clarification

---
 README.md | 69 +++++++++++++++++++++++++++++++++----------------------
 1 file changed, 42 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index 0618c9e..05ee3ff 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,9 @@
-Tracking machine learning challenge (TrackML) utility library
-=============================================================
+TrackML utility library
+=======================
 
-A python library to simplify working with the dataset of the tracking machine
-learning challenge.
+A Python library to simplify working with the
+[High Energy Physics Tracking Machine Learning challenge][kaggle_trackml]
+dataset.
 
 Installation
 ------------
@@ -50,9 +51,10 @@ for event_id, hits, cells, particles, truth in load_dataset('path/to/dataset'):
     ...
 ```
 
-Each event is lazily loaded during the iteration. Options are available to
-read only a subset of available events or only read selected parts, e.g. only
-hits or only particles.
+The dataset path can be the path to a directory or to a zip file containing the
+events csv files. Each event is lazily loaded during the iteration. Options are
+available to read only a subset of available events or only read selected parts,
+e.g. only hits or only particles.
 
 To generate a random test submission from truth information and compute the
 expected score:
@@ -65,8 +67,8 @@ shuffled = shuffle_hits(truth, 0.05) # 5% probability to reassign a hit
 score = score_event(truth, shuffled)
 ```
 
-All methods either take or return `pandas.DataFrame` objects. Please have a look
-at the function docstrings for detailed documentation.
+All methods either take or return `pandas.DataFrame` objects. You can have a
+look at the function docstrings for detailed information.
 
 Authors
 -------
@@ -95,8 +97,10 @@ hits, their truth association to particles, and the initial parameters of those
 particles. The test dataset contains only the recorded hits.
 
 The dataset is provided as a set of plain `.csv` files ('.csv.gz' or '.csv.bz2'
-are also allowed)'. Each event has four associated files that contain hits,
-hit cells, particles, and the ground truth association between them. The common prefix (like `event000000000`) is fully constrained to be `event` followed by 9 digits.
+are also allowed)'. Each event has four associated files that contain hits, hit
+cells, particles, and the ground truth association between them. The common
+prefix (like `event000000000`) is fully constrained to be `event` followed by 9
+digits.
 
     event000000000-hits.csv
     event000000000-cells.csv
@@ -132,15 +136,17 @@ are given here to simplify detector-specific data handling.
 ### Event hit cells
 
 The cells file contains the constituent active detector cells that comprise each
-hit. A cell is the smallest granularity inside each detector module, much like a pixel on a screen, except that depending on the volume_id a cell can be a square or a long rectangle. It is
-identified by two channel identifiers that are unique within each detector
-module and encode the position, much like row/column numbers of a matrix. A cell can provide signal information that the
-detector module has recorded in addition to the position. Depending on the
-detector type only one of the channel identifiers is valid, e.g. for the strip
-detectors, and the value might have different resolution.
+hit. A cell is the smallest granularity inside each detector module, much like a
+pixel on a screen, except that depending on the volume_id a cell can be a square
+or a long rectangle. It is identified by two channel identifiers that are unique
+within each detector module and encode the position, much like column/row
+numbers of a matrix. A cell can provide signal information that the detector
+module has recorded in addition to the position. Depending on the detector type
+only one of the channel identifiers is valid, e.g. for the strip detectors, and
+the value might have different resolution.
 
 *   **hit_id**: numerical identifier of the hit as defined in the hits file.
-*   **ch0, ch1**: channel identifier/coordinates unique with one module.
+*   **ch0, ch1**: channel identifier/coordinates unique within one module.
 *   **value**: signal value information, e.g. how much charge a particle has
     deposited.
 
@@ -149,7 +155,8 @@ detectors, and the value might have different resolution.
 The particles files contains the following values for each particle/entry:
 
 *   **particle_id**: numerical identifier of the particle inside the event.
-*   **vx, vy, vz**: initial position (in millimeters) (vertex) in global coordinates.
+*   **vx, vy, vz**: initial position or vertex (in millimeters) in global
+    coordinates.
 *   **px, py, pz**: initial momentum (in GeV/c) along each global axis.
 *   **q**: particle charge (as multiple of the absolute electron charge).
 *   **nhits**: number of hits generated by this particle
@@ -165,23 +172,31 @@ particle/track.
 *   **hit_id**: numerical identifier of the hit as defined in the hits file.
 *   **particle_id**: numerical identifier of the generating particle as defined
     in the particles file.
-*   **tx, ty, tz** true intersection point in global coordinates (in millimeters) between
-    the particle trajectory and the sensitive surface.
-*   **tpx, tpy, tpz** true particle momentum (in GeV/c) in the global coordinate system
-    at the intersection point. The corresponding unit vector is tangent to the particle trajectory.
+*   **tx, ty, tz** true intersection point in global coordinates (in
+    millimeters) between the particle trajectory and the sensitive surface.
+*   **tpx, tpy, tpz** true particle momentum (in GeV/c) in the global
+    coordinate system at the intersection point. The corresponding vector
+    is tangent to the particle trajectory at the intersection point.
 *   **weight** per-hit weight used for the scoring metric; total sum of weights
     within one event equals to one.
 
 ### Dataset submission information
 
-The submission file must associate each hit in each event to one and only one reconstructed particle track. The reconstructed tracks must be uniquely identified only within each event.  Participants are advised to compress the submission file (with zip, bzip2, gzip) before submission to Kaggle site. 
+The submission file must associate each hit in each event to one and only one
+reconstructed particle track. The reconstructed tracks must be uniquely
+identified only within each event.  Participants are advised to compress the
+submission file (with zip, bzip2, gzip) before submission to
+[Kaggle site](kaggle_trackml).
 
 *   **event_id**: numerical identifier of the event; corresponds to the number
     found in the per-event file name prefix.
-*   **hit_id**: numerical identifier (non negative integer) of the hit inside the event as defined in the per-event hits file.
-*   **track_id**: user defined numerical identifier (non negative integer) of the track 
+*   **hit_id**: numerical identifier of the hit inside the event as defined in
+    the per-event hits file.
+*   **track_id**: user-defined numerical identifier (non-negative integer) of
+    the track
 
 
-[cern]: https://home.cern/
+[cern]: https://home.cern
 [lhc]: https://home.cern/topics/large-hadron-collider
 [mit_license]: http://www.opensource.org/licenses/MIT
+[kaggle_trackml]: https://www.kaggle.com/c/trackml-particle-identification

From 3c8966a87ac36bb6f07c274e10a0818cd2a15cdc Mon Sep 17 00:00:00 2001
From: Moritz Kiehn <msmk@cern.ch>
Date: Tue, 24 Apr 2018 12:00:13 +0200
Subject: [PATCH 2/5] update package version and url

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 2bc9030..ce3eeec 100644
--- a/setup.py
+++ b/setup.py
@@ -11,11 +11,11 @@
 
 setup(
     name='trackml',
-    version='1b0',
+    version='1b1',
     description='TrackML utility library',
     long_description=long_description,
     long_description_content_type='text/markdown',
-    # url='TODO',
+    url='https://github.com/LAL/trackml-library',
     author='Moritz Kiehn', # TODO who else
     author_email='msmk@cern.ch', # TODO or mailing list
     classifiers=[

From 9c40750b298bd55b0ccf2b5028217e9990232dad Mon Sep 17 00:00:00 2001
From: Moritz Kiehn <msmk@cern.ch>
Date: Tue, 24 Apr 2018 18:34:37 +0200
Subject: [PATCH 3/5] README: format fixes

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 05ee3ff..3bf9dad 100644
--- a/README.md
+++ b/README.md
@@ -52,9 +52,9 @@ for event_id, hits, cells, particles, truth in load_dataset('path/to/dataset'):
 ```
 
 The dataset path can be the path to a directory or to a zip file containing the
-events csv files. Each event is lazily loaded during the iteration. Options are
-available to read only a subset of available events or only read selected parts,
-e.g. only hits or only particles.
+events `.csv` files. Each event is lazily loaded during the iteration. Options
+are available to read only a subset of available events or only read selected
+parts, e.g. only hits or only particles.
 
 To generate a random test submission from truth information and compute the
 expected score:
@@ -96,8 +96,8 @@ some hits can be left unassigned). The training dataset contains the recorded
 hits, their truth association to particles, and the initial parameters of those
 particles. The test dataset contains only the recorded hits.
 
-The dataset is provided as a set of plain `.csv` files ('.csv.gz' or '.csv.bz2'
-are also allowed)'. Each event has four associated files that contain hits, hit
+The dataset is provided as a set of plain `.csv` files (`.csv.gz` or `.csv.bz2`
+are also allowed). Each event has four associated files that contain hits, hit
 cells, particles, and the ground truth association between them. The common
 prefix (like `event000000000`) is fully constrained to be `event` followed by 9
 digits.
@@ -185,7 +185,7 @@ particle/track.
 The submission file must associate each hit in each event to one and only one
 reconstructed particle track. The reconstructed tracks must be uniquely
 identified only within each event.  Participants are advised to compress the
-submission file (with zip, bzip2, gzip) before submission to
+submission file (with zip, bzip2, gzip) before submission to the
 [Kaggle site](kaggle_trackml).
 
 *   **event_id**: numerical identifier of the event; corresponds to the number

From 785a6ffc392028d3d5567d79a8bc1a2e54ee85d1 Mon Sep 17 00:00:00 2001
From: Moritz Kiehn <msmk@cern.ch>
Date: Tue, 24 Apr 2018 18:35:18 +0200
Subject: [PATCH 4/5] weights: use nhits from particle input

---
 trackml/weights.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/trackml/weights.py b/trackml/weights.py
index e32f011..0469b97 100644
--- a/trackml/weights.py
+++ b/trackml/weights.py
@@ -87,18 +87,20 @@ def weight_hits(truth, particles):
     truth : pandas.DataFrame
         Truth information. Must have hit_id, particle_id, and tz columns.
     particles : pandas.DataFrame
-        Particle information. Must have particle_id, vz, px, and py columns.
+        Particle information. Must have particle_id, vz, px, py, and nhits
+        columns.
 
     Returns
     -------
     pandas.DataFrame
-        `truth` augmented with additional columns: ihit, nhits, weight_order,
-        weight_pt, and weight.
+        `truth` augmented with additional columns: particle_nhits, ihit,
+        weight_order, weight_pt, and weight.
     """
     # fill selected per-particle information for each hit
     selected = pandas.DataFrame({
         'particle_id': particles['particle_id'],
         'particle_vz': particles['vz'],
+        'particle_nhits': particles['nhits'],
         'weight_pt': weight_pt(numpy.hypot(particles['px'], particles['py'])),
     })
     combined = pandas.merge(truth, selected,
@@ -107,15 +109,14 @@ def weight_hits(truth, particles):
 
     # fix pt weight for hits w/o associated particle
     combined['weight_pt'].fillna(0.0, inplace=True)
-
+    # fix nhits for hits w/o associated particle
+    combined['particle_nhits'].fillna(0.0, inplace=True)
+    combined['particle_nhits'] = combined['particle_nhits'].astype('i4')
     # compute hit count and order using absolute distance from particle vertex
     combined['abs_dvz'] = numpy.absolute(combined['tz'] - combined['particle_vz'])
-    combined['nhits'] = combined.groupby('particle_id')['abs_dvz'].transform(numpy.size).astype('i4')
-    combined.loc[combined['particle_id'] == INVALID_PARTICLED_ID, 'nhits'] = 0
     combined['ihit'] = combined.groupby('particle_id')['abs_dvz'].rank().transform(lambda x: x - 1).fillna(0.0).astype('i4')
-
     # compute order-dependent weight
-    combined['weight_order'] = combined[['ihit', 'nhits']].apply(weight_order, axis=1)
+    combined['weight_order'] = combined[['ihit', 'particle_nhits']].apply(weight_order, axis=1)
 
     # compute combined weight normalized to 1
     w = combined['weight_pt'] * combined['weight_order']

From 8727bf23c6cfb95001e9b19752d3329058112434 Mon Sep 17 00:00:00 2001
From: Moritz Kiehn <msmk@cern.ch>
Date: Tue, 24 Apr 2018 18:42:33 +0200
Subject: [PATCH 5/5] increase to version 1

---
 setup.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index ce3eeec..5421b2a 100644
--- a/setup.py
+++ b/setup.py
@@ -11,15 +11,15 @@
 
 setup(
     name='trackml',
-    version='1b1',
+    version='1',
     description='TrackML utility library',
     long_description=long_description,
     long_description_content_type='text/markdown',
     url='https://github.com/LAL/trackml-library',
-    author='Moritz Kiehn', # TODO who else
-    author_email='msmk@cern.ch', # TODO or mailing list
+    author='Moritz Kiehn',
+    author_email='msmk@cern.ch',
     classifiers=[
-        'Development Status :: 4 - Beta', # TODO update for first release
+        'Development Status :: 5 - Production/Stable',
         'Intended Audience :: Science/Research',
         'Topic :: Scientific/Engineering :: Information Analysis',
         'Topic :: Scientific/Engineering :: Physics',