From c1ac445149acd117a184563deaccb1948d668970 Mon Sep 17 00:00:00 2001 From: Moritz Kiehn Date: Tue, 24 Apr 2018 11:50:41 +0200 Subject: [PATCH 1/5] README: some wording and clarification --- README.md | 69 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 0618c9e..05ee3ff 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,9 @@ -Tracking machine learning challenge (TrackML) utility library -============================================================= +TrackML utility library +======================= -A python library to simplify working with the dataset of the tracking machine -learning challenge. +A python library to simplify working with the +[High Energy Physics Tracking Machine Learning challenge][kaggle_trackml] +dataset. Installation ------------ @@ -50,9 +51,10 @@ for event_id, hits, cells, particles, truth in load_dataset('path/to/dataset'): ... ``` -Each event is lazily loaded during the iteration. Options are available to -read only a subset of available events or only read selected parts, e.g. only -hits or only particles. +The dataset path can be the path to a directory or to a zip file containing the +events csv files. Each event is lazily loaded during the iteration. Options are +available to read only a subset of available events or only read selected parts, +e.g. only hits or only particles. To generate a random test submission from truth information and compute the expected score: @@ -65,8 +67,8 @@ shuffled = shuffle_hits(truth, 0.05) # 5% probability to reassign a hit score = score_event(truth, shuffled) ``` -All methods either take or return `pandas.DataFrame` objects. Please have a look -at the function docstrings for detailed documentation. +All methods either take or return `pandas.DataFrame` objects. You can have a +look at the function docstrings for detailed information. 
Authors ------- @@ -95,8 +97,10 @@ hits, their truth association to particles, and the initial parameters of those particles. The test dataset contains only the recorded hits. The dataset is provided as a set of plain `.csv` files ('.csv.gz' or '.csv.bz2' -are also allowed)'. Each event has four associated files that contain hits, -hit cells, particles, and the ground truth association between them. The common prefix (like `event000000000`) is fully constrained to be `event` followed by 9 digits. +are also allowed)'. Each event has four associated files that contain hits, hit +cells, particles, and the ground truth association between them. The common +prefix (like `event000000000`) is fully constrained to be `event` followed by 9 +digits. event000000000-hits.csv event000000000-cells.csv @@ -132,15 +136,17 @@ are given here to simplify detector-specific data handling. ### Event hit cells The cells file contains the constituent active detector cells that comprise each -hit. A cell is the smallest granularity inside each detector module, much like a pixel on a screen, except that depending on the volume_id a cell can be a square or a long rectangle. It is -identified by two channel identifiers that are unique within each detector -module and encode the position, much like row/column numbers of a matrix. A cell can provide signal information that the -detector module has recorded in addition to the position. Depending on the -detector type only one of the channel identifiers is valid, e.g. for the strip -detectors, and the value might have different resolution. +hit. A cell is the smallest granularity inside each detector module, much like a +pixel on a screen, except that depending on the volume_id a cell can be a square +or a long rectangle. It is identified by two channel identifiers that are unique +within each detector module and encode the position, much like column/row +numbers of a matrix. 
A cell can provide signal information that the detector +module has recorded in addition to the position. Depending on the detector type +only one of the channel identifiers is valid, e.g. for the strip detectors, and +the value might have different resolution. * **hit_id**: numerical identifier of the hit as defined in the hits file. -* **ch0, ch1**: channel identifier/coordinates unique with one module. +* **ch0, ch1**: channel identifier/coordinates unique within one module. * **value**: signal value information, e.g. how much charge a particle has deposited. @@ -149,7 +155,8 @@ detectors, and the value might have different resolution. The particles files contains the following values for each particle/entry: * **particle_id**: numerical identifier of the particle inside the event. -* **vx, vy, vz**: initial position (in millimeters) (vertex) in global coordinates. +* **vx, vy, vz**: initial position or vertex (in millimeters) in global + coordinates. * **px, py, pz**: initial momentum (in GeV/c) along each global axis. * **q**: particle charge (as multiple of the absolute electron charge). * **nhits**: number of hits generated by this particle @@ -165,23 +172,31 @@ particle/track. * **hit_id**: numerical identifier of the hit as defined in the hits file. * **particle_id**: numerical identifier of the generating particle as defined in the particles file. -* **tx, ty, tz** true intersection point in global coordinates (in millimeters) between - the particle trajectory and the sensitive surface. -* **tpx, tpy, tpz** true particle momentum (in GeV/c) in the global coordinate system - at the intersection point. The corresponding unit vector is tangent to the particle trajectory. +* **tx, ty, tz** true intersection point in global coordinates (in + millimeters) between the particle trajectory and the sensitive surface. +* **tpx, tpy, tpz** true particle momentum (in GeV/c) in the global + coordinate system at the intersection point. 
The corresponding vector + is tangent to the particle trajectory at the intersection point. * **weight** per-hit weight used for the scoring metric; total sum of weights within one event equals to one. ### Dataset submission information -The submission file must associate each hit in each event to one and only one reconstructed particle track. The reconstructed tracks must be uniquely identified only within each event. Participants are advised to compress the submission file (with zip, bzip2, gzip) before submission to Kaggle site. +The submission file must associate each hit in each event to one and only one +reconstructed particle track. The reconstructed tracks must be uniquely +identified only within each event. Participants are advised to compress the +submission file (with zip, bzip2, gzip) before submission to +[Kaggle site](kaggle_trackml). * **event_id**: numerical identifier of the event; corresponds to the number found in the per-event file name prefix. -* **hit_id**: numerical identifier (non negative integer) of the hit inside the event as defined in the per-event hits file. -* **track_id**: user defined numerical identifier (non negative integer) of the track +* **hit_id**: numerical identifier of the hit inside the event as defined in + the per-event hits file. 
+* **track_id**: user-defined numerical identifier (non-negative integer) of + the track -[cern]: https://home.cern/ +[cern]: https://home.cern [lhc]: https://home.cern/topics/large-hadron-collider [mit_license]: http://www.opensource.org/licenses/MIT +[kaggle_trackml]: https://www.kaggle.com/c/trackml-particle-identification From 3c8966a87ac36bb6f07c274e10a0818cd2a15cdc Mon Sep 17 00:00:00 2001 From: Moritz Kiehn Date: Tue, 24 Apr 2018 12:00:13 +0200 Subject: [PATCH 2/5] update package version and url --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 2bc9030..ce3eeec 100644 --- a/setup.py +++ b/setup.py @@ -11,11 +11,11 @@ setup( name='trackml', - version='1b0', + version='1b1', description='TrackML utility library', long_description=long_description, long_description_content_type='text/markdown', - # url='TODO', + url='https://github.com/LAL/trackml-library', author='Moritz Kiehn', # TODO who else author_email='msmk@cern.ch', # TODO or mailing list classifiers=[ From 9c40750b298bd55b0ccf2b5028217e9990232dad Mon Sep 17 00:00:00 2001 From: Moritz Kiehn Date: Tue, 24 Apr 2018 18:34:37 +0200 Subject: [PATCH 3/5] README: format fixes --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 05ee3ff..3bf9dad 100644 --- a/README.md +++ b/README.md @@ -52,9 +52,9 @@ for event_id, hits, cells, particles, truth in load_dataset('path/to/dataset'): ``` The dataset path can be the path to a directory or to a zip file containing the -events csv files. Each event is lazily loaded during the iteration. Options are -available to read only a subset of available events or only read selected parts, -e.g. only hits or only particles. +events `.csv` files. Each event is lazily loaded during the iteration. Options +are available to read only a subset of available events or only read selected +parts, e.g. only hits or only particles. 
To generate a random test submission from truth information and compute the expected score: @@ -96,8 +96,8 @@ some hits can be left unassigned). The training dataset contains the recorded hits, their truth association to particles, and the initial parameters of those particles. The test dataset contains only the recorded hits. -The dataset is provided as a set of plain `.csv` files ('.csv.gz' or '.csv.bz2' -are also allowed)'. Each event has four associated files that contain hits, hit +The dataset is provided as a set of plain `.csv` files (`.csv.gz` or `.csv.bz2` +are also allowed). Each event has four associated files that contain hits, hit cells, particles, and the ground truth association between them. The common prefix (like `event000000000`) is fully constrained to be `event` followed by 9 digits. @@ -185,7 +185,7 @@ particle/track. The submission file must associate each hit in each event to one and only one reconstructed particle track. The reconstructed tracks must be uniquely identified only within each event. Participants are advised to compress the -submission file (with zip, bzip2, gzip) before submission to +submission file (with zip, bzip2, gzip) before submission to the [Kaggle site](kaggle_trackml). * **event_id**: numerical identifier of the event; corresponds to the number From 785a6ffc392028d3d5567d79a8bc1a2e54ee85d1 Mon Sep 17 00:00:00 2001 From: Moritz Kiehn Date: Tue, 24 Apr 2018 18:35:18 +0200 Subject: [PATCH 4/5] weights: use nhits from particle input --- trackml/weights.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/trackml/weights.py b/trackml/weights.py index e32f011..0469b97 100644 --- a/trackml/weights.py +++ b/trackml/weights.py @@ -87,18 +87,20 @@ def weight_hits(truth, particles): truth : pandas.DataFrame Truth information. Must have hit_id, particle_id, and tz columns. particles : pandas.DataFrame - Particle information. Must have particle_id, vz, px, and py columns. + Particle information. 
Must have particle_id, vz, px, py, and nhits + columns. Returns ------- pandas.DataFrame - `truth` augmented with additional columns: ihit, nhits, weight_order, - weight_pt, and weight. + `truth` augmented with additional columns: particle_nhits, ihit, + weight_order, weight_pt, and weight. """ # fill selected per-particle information for each hit selected = pandas.DataFrame({ 'particle_id': particles['particle_id'], 'particle_vz': particles['vz'], + 'particle_nhits': particles['nhits'], 'weight_pt': weight_pt(numpy.hypot(particles['px'], particles['py'])), }) combined = pandas.merge(truth, selected, @@ -107,15 +109,14 @@ def weight_hits(truth, particles): # fix pt weight for hits w/o associated particle combined['weight_pt'].fillna(0.0, inplace=True) - + # fix nhits for hits w/o associated particle + combined['particle_nhits'].fillna(0.0, inplace=True) + combined['particle_nhits'] = combined['particle_nhits'].astype('i4') # compute hit count and order using absolute distance from particle vertex combined['abs_dvz'] = numpy.absolute(combined['tz'] - combined['particle_vz']) - combined['nhits'] = combined.groupby('particle_id')['abs_dvz'].transform(numpy.size).astype('i4') - combined.loc[combined['particle_id'] == INVALID_PARTICLED_ID, 'nhits'] = 0 combined['ihit'] = combined.groupby('particle_id')['abs_dvz'].rank().transform(lambda x: x - 1).fillna(0.0).astype('i4') - # compute order-dependent weight - combined['weight_order'] = combined[['ihit', 'nhits']].apply(weight_order, axis=1) + combined['weight_order'] = combined[['ihit', 'particle_nhits']].apply(weight_order, axis=1) # compute combined weight normalized to 1 w = combined['weight_pt'] * combined['weight_order'] From 8727bf23c6cfb95001e9b19752d3329058112434 Mon Sep 17 00:00:00 2001 From: Moritz Kiehn Date: Tue, 24 Apr 2018 18:42:33 +0200 Subject: [PATCH 5/5] increase to version 1 --- setup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index ce3eeec..5421b2a 
100644 --- a/setup.py +++ b/setup.py @@ -11,15 +11,15 @@ setup( name='trackml', - version='1b1', + version='1', description='TrackML utility library', long_description=long_description, long_description_content_type='text/markdown', url='https://github.com/LAL/trackml-library', - author='Moritz Kiehn', # TODO who else - author_email='msmk@cern.ch', # TODO or mailing list + author='Moritz Kiehn', + author_email='msmk@cern.ch', classifiers=[ - 'Development Status :: 4 - Beta', # TODO update for first release + 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Science/Research', 'Topic :: Scientific/Engineering :: Information Analysis', 'Topic :: Scientific/Engineering :: Physics',