diff --git a/.gitignore b/.gitignore index b6e4761..320900a 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,6 @@ dmypy.json # Pyre type checker .pyre/ +data/aisdk_20170701.csv +data/aisdk_20180101.csv +*.pkl diff --git a/README.md b/README.md index ce8d1a0..b856b74 100644 --- a/README.md +++ b/README.md @@ -1 +1,7 @@ -# EDA-protocol-movement \ No newline at end of file +# A protocol for identifying problems in continuous movement data + +This notebook provides an open-source implementation of the protocol presented in the paper: + +Graser, A. (2021) An exploratory data analysis protocol for identifying problems in continuous movement data. *Journal of Location Based Services.* http://dx.doi.org/10.1080/17489725.2021.1900612. + +The individual protocol steps are demonstrated using a dataset of vessel tracking data (AIS) published by the Danish Maritime Authority. The demo data covers two days (July 1st, 2017 and January 1st, 2018). Since the datasets are too large for GitHub, they have been made available via Figshare: https://doi.org/10.6084/m9.figshare.11577543 diff --git a/data/README.md b/data/README.md new file mode 100644 index 0000000..9013eb9 --- /dev/null +++ b/data/README.md @@ -0,0 +1 @@ +Since the datasets are too large for GitHub, they have been made available via Figshare: https://doi.org/10.6084/m9.figshare.11577543 diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..26d1a79 --- /dev/null +++ b/environment.yml @@ -0,0 +1,23 @@ +name: eda4m +channels: + - conda-forge + - defaults +dependencies: + - python=3.7 + - numpy + - cython + - matplotlib + - seaborn + - pandas + - geopandas + - rasterio + - hvplot + - bokeh + - datashader + - proj + - cartopy + - geoviews + - scikit-learn + - geopy + - panel + - movingpandas diff --git a/protocol.ipynb b/protocol.ipynb new file mode 100644 index 0000000..8130579 --- /dev/null +++ b/protocol.ipynb @@ -0,0 +1,1334 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# A protocol for identifying problems in continuous movement data \n", + "\n", + "This notebook provides an open-source implementation of the protocol presented in the paper: \n", + "\n", + "Graser, A. (2021) An exploratory data analysis protocol for identifying problems in continuous movement data. *Journal of Location Based Services.* http://dx.doi.org/10.1080/17489725.2021.1900612.\n", + "\n", + "The individual protocol steps are demonstrated using a dataset of vessel tracking data (AIS) published by the Danish Maritime Authority. The demo data covers two days (July 1st, 2017 and January 1st, 2018). Since the datasets are too large for GitHub, they have been made available via Figshare: https://doi.org/10.6084/m9.figshare.11577543\n", + "\n", + "\n", + "## Content\n", + "\n", + "- [Setup](#Setup)\n", + "- [A. Missing data](#A.-Missing-data)\n", + "- [B. Precision problems](#B.-Precision-problems)\n", + "- [C. Consistency problems](#C.-Consistency-problems)\n", + "- [D. Accuracy problems](#D.-Accuracy-problems)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup\n", "\n", "Before running this notebook, make sure to [download](https://doi.org/10.6084/m9.figshare.11577543) *dk_csv_20170701.7z* and *dk_csv_20180101.7z* and unzip the files into the data directory:" ] },
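{ "cell_type": "markdown", "metadata": {}, "source": [ "If you prefer to unpack the archives from Python, the next cell is a minimal sketch: it assumes the two .7z files have already been downloaded next to this notebook and that the py7zr package is installed (py7zr is not part of environment.yml)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# optional sketch: extract the downloaded archives into ./data\n", "# (assumes py7zr, which is not in environment.yml)\n", "import py7zr\n", "\n", "for archive in ['dk_csv_20170701.7z', 'dk_csv_20180101.7z']:\n", " with py7zr.SevenZipFile(archive, mode='r') as z:\n", " z.extractall(path='./data')" ] },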
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "input_files = [\n", " './data/aisdk_20170701.csv',\n", " './data/aisdk_20180101.csv'\n", "]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", "display(HTML(\"\"))\n", "\n", "import warnings\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "FIGSIZE = (600,400)\n", "SMSIZE = 300\n", "COLOR = 'darkblue'\n", "COLOR_HIGHLIGHT = 'red'\n", "COLOR_BASE = 'grey'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from math import sin, cos, atan2, radians, degrees, sqrt, pi\n", "from datetime import datetime, date\n", "import numpy as np\n", "import pandas as pd\n", "import geopandas as gpd\n", "import movingpandas as mpd\n", "import datashader as ds\n", "import holoviews as hv\n", "from shapely.geometry import Point, LineString\n", "from holoviews.operation.datashader import datashade, spread\n", "from holoviews.element import tiles\n", "from holoviews import opts, dim \n", "import hvplot.pandas\n", "\n", "R_EARTH = 6371000 # radius of earth in meters\n", "C_EARTH = 2 * R_EARTH * pi # circumference\n", "BG_TILES = tiles.CartoLight()\n", "\n", "pd.set_option('use_inf_as_na', True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plot_single_mover(df, mover_id, the_date):\n", " tmp = df[(df.id==mover_id) & (df.index.date==the_date)]\n", " gdf = gpd.GeoDataFrame(tmp.drop(['x', 'y'], axis=1), crs={'init': 'epsg:3857'}, geometry=[Point(xy) for xy in zip(tmp.x, tmp.y)])\n", " plot = mpd.Trajectory(gdf, 1).hvplot(title=f'Mover {mover_id} ({the_date})', c='speed_m/s', cmap='RdYlBu', colorbar=True, clim=(0,15), \n", " line_width=5, width=FIGSIZE[0], height=FIGSIZE[1], tiles='CartoLight')\n", " return plot" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(input_files[0], nrows=100)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df['SOG'].hist(bins=100, figsize=(15,3))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = None\n", "for input_file in input_files[:2]: \n", " a = pd.read_csv(input_file, usecols=['# Timestamp', 'MMSI', 'Latitude', 'Longitude', 'SOG', 'Type of mobile', 'Ship type', 'Navigational status'])\n", " a = a[(a['Type of mobile'] == 'Class A') & (a.SOG>0)]\n", " a.drop(columns=['Type of mobile', 'SOG'], inplace=True)\n", " if df is None:\n", " df = a\n", " else:\n", " df = df.append(a)\n", " \n", "df.rename(columns={'# Timestamp':'time', 'MMSI':'id', 'Latitude':'lat', 'Longitude':'lon', 'Ship type':'shiptype', 'Navigational status':'navstat'}, inplace=True)\n", "df['time'] = pd.to_datetime(df['time'], format='%d/%m/%Y %H:%M:%S')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.loc[:, 'x'], df.loc[:, 'y'] = ds.utils.lnglat_to_meters(df.lon, df.lat)\n", "\n", "df.set_index('time', inplace=True)\n", "\n", "df['navstat'] = df['navstat'].astype('category')\n", "df['shiptype'] = df['shiptype'].astype('category')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print('Number of records: {} million'.format(round(len(df)/1000000)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## A. Missing data\n", "\n", "Checking for missing data is a common starting point for exploring new movement datasets. Missing data may indicate issues with the data collection process or the data export used to generate the analysis dataset. At this early stage, we usually start with raw location records that have yet to be aggregated into trajectories. Therefore, initial analyses look at elementary position records. The following protocol steps target issues of missing data with respect to movement data's spatial, temporal, and attribute dimensions. \n", "\n", "\n", "### A-1. Spatial gaps & outliers\n", "\n", "To gain an overview, the visual analysis should start from the whole time span before drilling down. Spatial context (usually in the form of base maps) is essential when assessing spatial extent and gaps because context influences movement." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Spatial spread / extent & outliers \n", "\n", "This step addresses the question if the dataset covers the expected spatial extent. This can be as simple as checking the minimum and maximum coordinate values of the raw records. However, it is not uncommon to encounter spurious location records or outliers that are not representative of the actual covered extent. These outliers may be truly erroneous positions but can also be correct positions that happen to be located outside the usual extent. Looking at elementary position records only, it is usually not possible to distinguish these two cases. It is therefore necessary to take note of these outliers and investigate further in later steps.\n", "\n", "Unexpected spatial extent can have different consequences, depending on whether the extent is too small, too large, or covering a wrong area. If the area of interest to the analyst's work is not covered, it may be necessary to go back to the data collection phase. If the extent is larger than expected, it can cause excessive processing run times. For example, if the dataset is to be rasterized with a fixed target raster cell size, an unexpectedly large extent will result in a larger than expected raster. Additionally, any outliers that exceed the valid coordinate ranges, for example, for latitude and longitude values, should inform the analyst to proceed with caution and stay alert regarding other potential data quality issues. \n", "\n", "Classic scatter plots (or point maps) are helpful at this step. 
Point density maps (often called heat maps) on their default settings tend to hide outliers and are therefore not recommended. Interactive map visualizations should be preferred to static maps since interaction capabilities (such as zooming and panning) enable a quicker assessment of the situation." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f'Spatial extent: x_min={df.lon.min()}, x_max={df.lon.max()}, y_min={df.lat.min()}, y_max={df.lat.max()}')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plot_basic_scatter(df, color='darkblue', title='', width=FIGSIZE[0], height=FIGSIZE[1], size=2):\n", " opts.defaults(opts.Overlay(active_tools=['wheel_zoom']))\n", " pts = df.hvplot.scatter(x='x', y='y', datashade=True, cmap=[color, color], frame_width=width, frame_height=height, title=str(title))\n", " return BG_TILES * spread(pts, px=size)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_basic_scatter(df, title='Spatial extent & outliers')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Optional cropping of outliers**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df[(df.lon>-90) & (df.lon<90) & (df.lat>0) & (df.lat<80)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cropped_df = df[(df.lon>0) & (df.lon<20) & (df.lat>52) & (df.lat<60)].copy()\n", "cropped_df['navstat'] = cropped_df['navstat'].astype('category')\n", "cropped_df['shiptype'] = cropped_df['shiptype'].astype('category')\n", "plot_basic_scatter(cropped_df, title='Cropped dataset')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Spatial gaps (selected areas / all movers / whole time span)\n", "\n", "This step addresses the question if there are spatial gaps in the data coverage. Depending on the type of movers, gaps in certain spatial contexts are to be expected. For example, we wouldn't expect taxi locations in lakes. Therefore, it is essential to evaluate these gaps in their spatial context using base maps showing relevant geographic features, such as the road network for vehicle data or navigation markers for vessel data. \n", "\n", "Unexpected spatial gaps are particularly problematic if they affect the area of interest to the analyst's work. However, even if the area of interest is not affected, spatial gaps may indicate systematic issues with the data collection process, such as observation gaps, that require addressing to achieve continuous coverage. \n", "\n", "Point density maps are helpful since they make it easy to identify areas with low densities, ignoring occasional outliers. The visualization scale influences which size of gaps can be discovered. However, there are of course practical limitations to exploring ever more detailed scales and the resulting, continuously growing number of gaps." ] },
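{ "cell_type": "markdown", "metadata": {}, "source": [ "Beyond visual inspection, a coarse numeric summary can flag low-density areas; a minimal sketch (0.1 degree bins are an arbitrary choice):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sketch: record counts per ~0.1 degree lat/lon bin; bins absent from this table are data gaps\n", "bins = cropped_df.groupby([cropped_df.lat.round(1), cropped_df.lon.round(1)]).size()\n", "bins.sort_values().head(10)" ] },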
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plot_point_density(df, title='', width=FIGSIZE[0], height=FIGSIZE[1]):\n", " opts.defaults(opts.Overlay(active_tools=['wheel_zoom']))\n", " pts = df.hvplot.scatter(x='x', y='y', title=str(title), datashade=True, frame_width=width, frame_height=height)\n", " return BG_TILES * pts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_point_density(df, title='Spatial gaps')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### A-2. Temporal gaps & outliers\n", "\n", "#### Temporal extent & outliers (whole territory / all movers / whole time span)\n", "\n", "This step addresses the question if the dataset covers the expected temporal extent. Similar to exploring the spatial extent, the obvious step is to determine the minimum and maximum timestamps first. Since GPS tracking requires accurate clocks to function, time information on the tracker is usually reliable. However, it is not guaranteed that these timestamps make it through the whole data collection and (pre)processing chain leading up to the exploratory analysis. For example, in some cases, tracker (or sender) time is replaced by receiver or storage time. Thus, clock errors on the receiving or storage devices can result in unexpected timestamps.\n", "\n", "Undiscovered timestamp issues can affect the derived temporal extent, as well as all other temporal and spatiotemporal analyses. A typical problem (especially when working with CSV data sources) is erroneous parsing of dates, for example, switching the digits for day and month when they cannot be inferred unambiguously. This leads to wrong temporal assignments, such as records from Feb 3rd being assigned to March 2nd. In other datasets, time information may be provided in the form of an offset value from a certain starting time. In this case, the record timestamps have to be reconstructed in order to put the movement data in context with other spatiotemporal data. \n", "\n", "Temporal charts, particularly record counts over time, are helpful to gain a first impression of the overall temporal extent and whether it is continuous or split into multiple time frames with little or no data in between.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f'Temporal extent: {df.index.min()} to {df.index.max()}')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "TIME_SAMPLE = '15min'\n", "\n", "df['id'].resample(TIME_SAMPLE).count()\\\n", " .hvplot(title=f'Number of records per {TIME_SAMPLE}', width=FIGSIZE[0])" ] },
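{ "cell_type": "markdown", "metadata": {}, "source": [ "A complementary numeric check (a sketch): counting empty bins quantifies gaps at the chosen resolution. For this demo dataset, most bins between the two days are empty by construction." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sketch: how many TIME_SAMPLE bins contain no records at all?\n", "counts = df['id'].resample(TIME_SAMPLE).count()\n", "print(f'{(counts==0).sum()} of {len(counts)} {TIME_SAMPLE} bins contain no records')" ] },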
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### Temporal gaps in linear sequence & temporal cycles (whole territory / all movers / time spans)\n", "\n", "This step addresses the question if there are temporal gaps in the dataset. Temporal gaps can be due to scheduled breaks in data collection, deliberate choices during data export, as well as unintended issues during data collection or (pre)processing. Similar to exploring spatial gaps, the temporal scale influences which size of gaps can be discovered. Temporal gaps can be one-time events or exhibit reoccurring patterns. For example, daily and weekly cycles are typical for human movement data.\n", "\n", "Undiscovered one-time, as well as reoccurring temporal gaps can be benign if they reflect mover behavior. However, if they are caused by systematic errors in the data collection or (pre)processing workflows, they can affect the validity of analysis and model results. For example, a data-driven model may predict low mover density based on its erroneous training data even though the real movement situation may be different.\n", "\n", "Two-dimensional time histograms are particularly helpful to discover reoccurring temporal gaps. Typical temporal cycles that may be worth exploring include daily, weekly, monthly, and yearly cycles. As already discussed with regard to spatial gaps, the discovery of temporal gaps is also affected by scale, that is, the temporal binning influences which size of gaps can be discovered. \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "counts_df = df['id'].groupby([df.index.hour, pd.Grouper(freq='d')]).count().to_frame(name='n')\n", "counts_df.rename_axis(['hour', 'day'], inplace=True)\n", "counts_df.hvplot.heatmap(title='Record count', x='hour', y='day', C='n', width=FIGSIZE[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### A-3. Spatiotemporal changes / gaps\n", "\n", "While the previous two steps looked at spatial gaps over the whole time span or temporal gaps for the whole territory, this step aims to explore spatiotemporal changes and gaps.\n", "\n", "#### Changing extent\n", "\n", "This step addresses the question whether there are changes in spatial extent over time. Changing spatial extent may be due to planned extensions or reductions of the data collection / observation area. Similarly, the extent is also expected to shift if the movers collectively change their location, as is the case, for example, with tracks of migrating birds.\n", "\n", "Unexpected changing or shifting extents mean that data may not be available for the full temporal extent for the whole area. This can limit the area suitable for spatiotemporal analyses, such as trend detection or training predictive models. Alternatively, analysts need to choose analysis methods and models that can handle missing data. In this case, care has to be taken when interpreting results and comparing performance in different regions with different data availability. \n", "\n", "Small multiples provide a quick way to compare extents between different time spans. 
To make sure that outliers can be easily spotted, classical point maps should be preferred to density maps.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plot_multiple_by_day(df, day, **kwargs):\n", " return plot_basic_scatter(df[df.index.date==day], title=day, width=SMSIZE, height=SMSIZE, **kwargs)\n", " \n", "def plot_multiples_by_day(df, **kwargs):\n", " days = df.index.to_period('D').unique()\n", " a = None\n", " for a_day in days:\n", " a_day = a_day.to_timestamp().date()\n", " plot = plot_multiple_by_day(df, a_day, **kwargs)\n", " if a is None: a = plot\n", " else: a = a + plot\n", " return a" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "plot_multiples_by_day(df).cols(2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "plot_multiples_by_day(cropped_df).cols(2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plot_multiple_by_hour_of_day(df, hour, fun):\n", " return fun(df[df.index.hour==hour], title=hour, width=SMSIZE, height=SMSIZE)\n", " \n", "def plot_multiples_by_hour_of_day(df, hours=range(0,24), fun=plot_basic_scatter):\n", " a = None\n", " for hour in hours:\n", " plot = plot_multiple_by_hour_of_day(df, hour, fun)\n", " if a is None: a = plot\n", " else: a = a + plot\n", " return a" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "plot_multiples_by_hour_of_day(df, hours=[6,7,8,9]).cols(2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Temporary gaps\n", "\n", "This step addresses the question whether there are temporary gaps in the overall spatial coverage. These local gaps may be one-time or reoccurring issues. Like temporary changes in the overall extent, temporary gaps can be due to mover behavior, as well as planned and unplanned changes of the data collection or (pre)processing workflows.\n", "\n", "Unexpected temporary local gaps can have similar consequences as temporal gaps. However, due to their localized nature they may be harder to spot and can therefore remain hidden for longer. Resulting delays in discovering data issues can be costly, for example, because time-intensive model training has to be repeated.\n", " \n", "Besides small multiples of density maps, animated density maps can be helpful at this step. Care should be taken to ensure that the color map configuration is consistent between time frames, that is, that the minimum and maximum values do not change since, otherwise, the density maps are not comparable. \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_multiples_by_hour_of_day(cropped_df, hours=[0,6,12,18], fun=plot_point_density).cols(2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### A-4. Attribute gaps\n", "\n", "Some attributes may only be available during certain time spans or in certain areas.\n", "\n", "#### Spatial attribute gaps\n", "\n", "This step addresses the question if there are areas with missing attribute data. 
Locally missing attribute data can be due to heterogeneous data collection setups.\n", "\n", "Unexpected changes in attribute coverage can severely limit the usefulness of affected records. If a certain attribute is essential for further analysis or modeling but is not available in all regions and cannot be inferred by any other means, the consequence of these attribute gaps can be as severe as spatial gaps with completely missing records. \n", "\n", "The methods used to explore spatial extent and gaps can be adapted to missing attribute data. Small multiples can be used to compare the spatial distribution of records with and without certain attribute values. \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "CATEGORY = 'shiptype' \n", "cats = df[CATEGORY].unique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cmap = {} \n", "for cat in cats:\n", " cmap[cat] = COLOR_BASE\n", "cmap['Unknown value'] = COLOR_HIGHLIGHT\n", "cmap['Undefined'] = COLOR_HIGHLIGHT" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plot_categorized_scatter(df, cat, title='', width=SMSIZE, height=SMSIZE, cmap=cmap):\n", " opts.defaults(opts.Overlay(active_tools=['wheel_zoom']))\n", " pts = df.hvplot.scatter(x='x', y='y', datashade=True, by=cat, colormap=cmap, legend=True, frame_width=width, frame_height=height, title=str(title))\n", " return BG_TILES * pts" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "unknown = df[(df[CATEGORY]=='Unknown value') | (df[CATEGORY]=='Undefined')]\n", "known = df[(df[CATEGORY]!='Unknown value') & (df[CATEGORY]!='Undefined')]\n", "\n", "( plot_categorized_scatter(df, CATEGORY, title='Categorized', width=SMSIZE, height=SMSIZE, cmap=cmap) + \n", " plot_basic_scatter(unknown, COLOR_HIGHLIGHT, title=f'Unknown {CATEGORY} only', width=SMSIZE, height=SMSIZE, size=1) +\n", " plot_basic_scatter(known, COLOR_BASE, title=f'Known {CATEGORY} only', width=SMSIZE, height=SMSIZE, size=1)\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Temporal attribute gaps\n", "\n", "This step addresses the question if there are temporary gaps in attribute data. Changes to the data collection or (pre)processing workflow can affect which attributes are available during certain time spans. Temporary attribute gaps may be one-time or reoccurring issues.\n", "\n", "Temporal attribute gaps can have similar consequences as spatial attribute gaps. If the attribute gap was caused by a change in the (pre)processing workflow, the damage may be reversible if the original data is still available. \n", "\n", "The methods used to explore temporal extent and gaps can be adapted to missing attribute data. 
Temporal charts, as well as two-dimensional time histograms may be used to find attribute gaps in linear sequence or in temporal cycles.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_multiples_by_day(unknown, color='red').cols(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DATE = date(2017,7,1)\n", + "unknown['id'].where(unknown.index.date==DATE).dropna().resample(TIME_SAMPLE).count().hvplot(\n", + " title=f'Records per {TIME_SAMPLE} on {DATE}', frame_width=SMSIZE, color='red', frame_height=SMSIZE, ylim=(0,82000), label='unknown'\n", + ") * known['id'].where(known.index.date==DATE).dropna().resample(TIME_SAMPLE).count().hvplot(\n", + " color='gray', label='known'\n", + ") " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DATE = date(2018,1,1)\n", + "unknown['id'].where(unknown.index.date==DATE).dropna().resample(TIME_SAMPLE).count().hvplot(\n", + " title=f'Records per {TIME_SAMPLE} on {DATE}', frame_width=SMSIZE, color='red', frame_height=SMSIZE, ylim=(0,82000), label='unknown'\n", + ") * known['id'].where(known.index.date==DATE).dropna().resample(TIME_SAMPLE).count().hvplot(\n", + " color='gray', label='known'\n", + ") " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DATA PREPARATION: Computing segment information" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def time_difference(row):\n", + " t1 = row['prev_t']\n", + " t2 = row['t']\n", + " return (t2-t1).total_seconds()\n", + "\n", + "def speed_difference(row):\n", + " return row['speed_m/s'] - row['prev_speed']\n", + "\n", + "def acceleration(row):\n", + " if row['diff_t_s'] == 0:\n", + " return None\n", + " return row['diff_speed'] / row['diff_t_s']\n", + "\n", + "def spherical_distance(lon1, lat1, lon2, lat2):\n", + " delta_lat = radians(lat2 - lat1)\n", + " delta_lon = radians(lon2 - lon1)\n", + " a = sin(delta_lat/2) * sin(delta_lat/2) + cos(radians(lat1)) * cos(radians(lat2)) * sin(delta_lon/2) * sin(delta_lon/2)\n", + " c = 2 * atan2(sqrt(a), sqrt(1 - a))\n", + " dist = R_EARTH * c\n", + " return dist\n", + "\n", + "def distance_to_prev(row):\n", + " return spherical_distance(row['prev_lon'], row['prev_lat'], row['lon'], row['lat'])\n", + " \n", + "def distance_to_next(row):\n", + " return spherical_distance(row['next_lon'], row['next_lat'], row['lon'], row['lat'])\n", + "\n", + "def direction(row):\n", + " lon1, lat1, lon2, lat2 = row['prev_lon'], row['prev_lat'], row['lon'], row['lat']\n", + " lat1 = radians(lat1)\n", + " lat2 = radians(lat2)\n", + " delta_lon = radians(lon2 - lon1)\n", + " x = sin(delta_lon) * cos(lat2)\n", + " y = cos(lat1) * sin(lat2) - (sin(lat1) * cos(lat2) * cos(delta_lon))\n", + " initial_bearing = atan2(x, y)\n", + " initial_bearing = degrees(initial_bearing)\n", + " compass_bearing = (initial_bearing + 360) % 360\n", + " return compass_bearing\n", + "\n", + "def angular_difference(row):\n", + " diff = abs(row['prev_dir'] - row['dir'])\n", + " if diff > 180:\n", + " diff = abs(diff - 360)\n", + " return diff \n", + "\n", + "def compute_segment_info(df):\n", + " df = df.copy()\n", + " df['t'] = df.index\n", + " df['prev_t'] = df.groupby('id')['t'].shift()\n", + " df['diff_t_s'] = df.apply(time_difference, axis=1)\n", + " df['prev_lon'] = df.groupby('id')['lon'].shift()\n", + " df['prev_lat'] = 
df.groupby('id')['lat'].shift()\n", + " df['prev_x'] = df.groupby('id')['x'].shift()\n", + " df['prev_y'] = df.groupby('id')['y'].shift()\n", + " df['diff_x'] = df['x'] - df['prev_x']\n", + " df['diff_y'] = df['y'] - df['prev_y']\n", + " df['next_lon'] = df.groupby('id')['lon'].shift(-1)\n", + " df['next_lat'] = df.groupby('id')['lat'].shift(-1)\n", + " df['dist_prev_m'] = df.apply(distance_to_prev, axis=1)\n", + " df['dist_next_m'] = df.apply(distance_to_next, axis=1)\n", + " df['speed_m/s'] = df['dist_prev_m']/df['diff_t_s']\n", + " df['prev_speed'] = df.groupby('id')['speed_m/s'].shift()\n", + " df['diff_speed'] = df.apply(speed_difference, axis=1)\n", + " df['acceleration'] = df.apply(acceleration, axis=1)\n", + " df['dir'] = df.apply(direction, axis=1)\n", + " df['prev_dir'] = df.groupby('id')['dir'].shift()\n", + " df['diff_dir'] = df.apply(angular_difference, axis=1)\n", + " df = df.drop(columns=['prev_x', 'prev_y', 'next_lon', 'next_lat', 'prev_speed', 'prev_dir'])\n", + " return df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "\n", "try:\n", " segment_df = pd.read_pickle('./segments.pkl')\n", "except:\n", " segment_df = compute_segment_info(cropped_df)\n", " segment_df.to_pickle(\"./segments.pkl\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "easteregg = cropped_df[(cropped_df.id==636092484) | (cropped_df.id==636092478)].copy()\n", "easteregg['id'] = 1\n", "segment_df = segment_df.append(compute_segment_info(easteregg))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### A-5. Gaps in trajectories\n", "\n", "\n", "Gaps in movement tracks can be due to technical failure of the tracking device, the mover leaving the observable area, deliberate deactivation of the tracking device, or (pre)processing issues.\n", "\n", "Unexpected gaps in tracks can affect derived trajectory measures, particularly by underestimating total length and derived measures, such as speed. Additionally, some regions may be more prone to gaps, for example due to unreliable local coverage. Consequently, there may be a lack of reliable data for these regions. \n", "\n", "Line density maps can be used to explore the distribution of gaps in tracks. The key is to only plot long segments, that is, connections between consecutive records that exceed a certain length. \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "GAP_MIN = 10000\n", "GAP_MAX = 100000\n", "\n", "segment_df['is_gap'] = ( (segment_df['dist_prev_m']>GAP_MIN) & (segment_df['dist_prev_m']<GAP_MAX) ) | ( (segment_df['dist_next_m']>GAP_MIN) & (segment_df['dist_next_m']<GAP_MAX) )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## B. Precision problems\n", "\n", "Movement records may be less precise than expected, for example, due to truncated coordinate values or timestamps. The following protocol steps target these precision problems.\n", "\n", "### B-1. Coordinate imprecision\n", "\n", "This step addresses the question if coordinates have been truncated excessively. Truncated coordinates snap positions to a coarse grid, and this shows up in derived movement directions: histograms of directions exhibit pronounced spikes at the grid's preferred angles.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "segment_df['dir'][segment_df['dist_prev_m']>0].hvplot.hist(bins=72, title='Histogram of directions')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### B-2. Timestamp imprecision \n", "\n", "This step addresses the question if timestamps have been truncated excessively. Imprecise timestamps are the result of undue truncation or rounding in the data collection or (pre)processing workflow. Truncation can result in duplicate/multiple position records of the same mover referring to the same time. \n", "\n", "Truncated timestamps can result in consecutive records with identical timestamps but different positions. This will result in zero-length time deltas between affected records and thus lead to division-by-zero errors when computing speeds. If positions are sparsely sampled, moderate truncation (for example, of milliseconds) will not result in multiple records with identical timestamps. However, derived speed values may still suffer from higher noise due to the imprecise representation of time between locations. \n", "\n", "Counts of records per timestamp and mover ID can help identify cases of excessively truncated timestamps. Histograms of the number of duplicate timestamps per mover ID show whether this issue affects all movers equally or if there are certain movers where this issue appears more frequently. Additional visualizations that can shed light on the issue of duplicate records include spatial plots (such as point and density maps) as well as temporal plots (such as two-dimensional time histograms) and spatiotemporal space-time cubes. \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "non_zero_movement = segment_df[segment_df.dist_prev_m>0]\n", "\n", "n_per_id_t = non_zero_movement[['id', 't', 'x']].groupby(['id', 't']).count().reset_index()\n", "n_per_id_t['x'].plot.hist(title='Counts of records per timestamp and mover ID', log=True)\n", "#n_per_id_t.groupby('x').count().hvplot(title='Counts of records per timestamp and mover ID', y='id', logy=True) # line plot not ideal\n", "#n_per_id_t['x'].hvplot.hist(title='Counts of records per timestamp and mover ID', logy=True) # upstream bug in log scale" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "duplicates_per_id = n_per_id_t[n_per_id_t.x>1].drop(columns=['t']).groupby(['id']).count().rename(columns={'x':'n'})\n", "duplicates_per_id['n'].plot.hist(title='Count of duplicate timestamps per mover ID', log=True)" ] },
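{ "cell_type": "markdown", "metadata": {}, "source": [ "If duplicate timestamps turn out to be an issue, one possible mitigation (a sketch; keeping the first duplicate is an arbitrary choice) is to reduce the data to a single record per mover and timestamp before recomputing segment information:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sketch: keep one record per (id, time) pair so diff_t_s cannot become zero\n", "deduplicated = cropped_df.reset_index().drop_duplicates(subset=['id', 'time'], keep='first').set_index('time')" ] },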
{ "cell_type": "markdown", "metadata": {}, "source": [ "## C. Consistency problems\n", "\n", "Datasets may not be as consistent with regard to collection parameters and type of movers as analysts expect. These problems usually cannot be detected from elementary position records. Therefore, intermediate segments or overall trajectories are needed. The following protocol steps target issues of heterogeneous sampling intervals as well as unexpected heterogeneous mover types and tracker types. \n", "\n", "### C-1. Sampling heterogeneity\n", "\n", "This step addresses the question whether the sampling frequency is stable. Some tracking systems provide records at regular time intervals. Other systems have rule-based sampling strategies. For example, in the Automatic Identification System (AIS), updates are more frequent when objects move quickly than when they stand still. In other contexts, GPS trackers may skip positions during straight-line movement. Other systems work on a best-effort basis with a target sampling interval that may be exceeded if the system is busy.\n", "\n", "Heterogeneous sampling intervals make datasets harder to analyze. Existing methods may expect regularly-sampled input data. Depending on the extent of the heterogeneity, it may be possible to resample the data (see the sketch below). Failing that, the methods have to be adjusted to support irregular or mixed sampling intervals. \n", "\n", "Histograms of sampling intervals help determine whether sampling intervals are stable and, if yes, what the typical sampling interval is. If not, they show the range of observed sampling intervals.\n" ] },
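{ "cell_type": "markdown", "metadata": {}, "source": [ "As a minimal sketch of the resampling option mentioned above (the 60 s target interval is an arbitrary assumption):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sketch: regularize each mover's positions to 60 s intervals via per-interval medians\n", "resampled = cropped_df.groupby('id')[['lat', 'lon', 'x', 'y']].resample('60s').median()" ] },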
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "segment_df.diff_t_s.hvplot.hist(title='Histogram of intervals between consecutive records (in seconds)', bins=100)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "segment_df[segment_df.diff_t_s<=120].diff_t_s.hvplot.hist(title='Histogram of intervals between consecutive records (in seconds)', bins=60)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Coordinate change plots can reveal previously mentioned resampling strategies. For example, if the resampling strategy is based on a certain minimum distance between records, there will be a hole in the center of the plot. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "segment_df.hvplot.scatter(title='Coordinate change plot', x='diff_x', y='diff_y', datashade=True, \n", " xlim=(-1000,1000), ylim=(-1000,1000), frame_width=FIGSIZE[1], frame_height=FIGSIZE[1])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### C-2. Mover heterogeneity\n", "\n", "This step addresses the question whether the dataset contains heterogeneous types of movers. Datasets of human movement are expected to contain a mix of different transport modes. Other datasets, such as floating car data (FCD), are expected to be more homogeneous, for example, to only contain car movements. However, errors in the collection process can invalidate this assumption. For example, if mobile (as opposed to built-in) trackers are used, they may be removed from vehicles and carried around by other means of transport. Other sources of heterogeneity are not due to errors but may still surprise the analysts. For example, AIS datasets also contain tracks from search and rescue vessels, which include helicopters.\n", "\n", "An unexpected mix of mover types can affect the validity of analysis results and the performance of derived models. Certain movement statistics may be over- or underestimated due to the presence of unexpected mover types. For example, the mean speed along a road may be underestimated if pedestrian tracks are mixed with vehicle tracks. Consequently, a travel time prediction algorithm may predict exaggerated travel times. Therefore, certain movers may have to be removed to ensure the validity of analysis and model results. \n", "\n", "Scatter plots of different combinations of trajectory characteristics, such as total length, mean speed, mean direction change, and typical acceleration can help gain a better understanding of how heterogeneous the movers in a dataset are. 
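Such feature combinations can also be clustered to separate candidate mover types; a minimal sketch, assuming scikit-learn's KMeans (k=3 is an arbitrary choice) and the `daily` aggregate computed in the next cell:\n", "\n", "```python\n", "from sklearn.cluster import KMeans\n", "\n", "features = daily[['dist_prev_m', 'speed_m/s']].dropna()\n", "labels = KMeans(n_clusters=3, random_state=0).fit_predict(features)  # cluster label per mover-day\n", "```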
\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "non_zero_speed = segment_df[(segment_df['speed_m/s']>0.1)]\n", + "daily = non_zero_speed.groupby(['id', pd.Grouper(freq='d')]).agg({'dist_prev_m':'sum', 'speed_m/s':'median'}) \n", + "\n", + "daily.hvplot.scatter(title='Daily travelled distance over median speed (m/s)', x='dist_prev_m', y='speed_m/s', \n", + " hover_cols=['id','time'], frame_width=FIGSIZE[1], frame_height=FIGSIZE[1], alpha=0.3, \n", + " xlim=(-100000,1500000), ylim=(-10,100))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_paths(original_df, title='', add_bg=True, height=FIGSIZE[1], width=FIGSIZE[0]):\n", + " grouped = [df[['x','y']] for name, df in original_df.groupby(['id']) ]\n", + " path = hv.Path(grouped, kdims=['x','y'])\n", + " plot = datashade(path, cmap=COLOR_HIGHLIGHT).opts(title=title, frame_height=height, frame_width=width)\n", + " if add_bg:\n", + " return BG_TILES * plot\n", + " else: \n", + " return plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "speedsters = daily[daily['speed_m/s']>20].reset_index().id.unique()\n", + "speedsters = segment_df[segment_df.id.isin(speedsters)]\n", + "plot_paths(speedsters, title='Speedsters') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "daily.hvplot.scatter(\n", + " title='Daily distance over median speed (m/s)', x='dist_prev_m', y='speed_m/s', \n", + " hover_cols=['id','time'], frame_width=SMSIZE, frame_height=SMSIZE, alpha=0.3, xlim=(-200000,4500000), ylim=(-10,100)\n", + ") + plot_paths(\n", + " speedsters, title='Speedsters', height=SMSIZE, width=SMSIZE\n", + ") " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "longdist = daily[daily['dist_prev_m']>800000].reset_index().id.unique()\n", + "longdist = segment_df[segment_df.id.isin(longdist)]\n", + "plot_paths(longdist, title='Long distance travelers')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DATA PREPARATION: Computing trajectory information" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MINIMUM_NUMBER_OF_RECORDS = 100\n", + "MINIMUM_SPEED_MS = 1\n", + "\n", + "def reset_values_at_daybreaks(tmp, columns):\n", + " tmp['ix'] = tmp.index\n", + " tmp['zero'] = 0\n", + " ix_first = tmp.groupby(['id', pd.Grouper(freq='d')]).first()['ix']\n", + " for col in columns:\n", + " tmp[col] = tmp['zero'].where(tmp['ix'].isin(ix_first), tmp[col])\n", + " tmp = tmp.drop(columns=['zero', 'ix'])\n", + " return tmp\n", + "\n", + "def compute_traj_info(segment_df):\n", + " tmp = segment_df.copy()\n", + " tmp['acceleration_abs'] = np.abs(tmp['acceleration'])\n", + " tmp['diff_speed_abs'] = np.abs(tmp['diff_speed'])\n", + " tmp = tmp.replace([np.inf, -np.inf], np.nan)\n", + "\n", + " tmp = reset_values_at_daybreaks(tmp, ['diff_t_s','dist_prev_m','diff_speed_abs','acceleration_abs'])\n", + "\n", + " traj_df = tmp.groupby(['id', pd.Grouper(freq='d')]) \\\n", + " .agg({'diff_t_s':['median', 'sum'], \n", + " 'speed_m/s':['median','std'],\n", + " 'diff_dir':['median','std'], \n", + " 'dist_prev_m':['median', 'sum'], \n", + " 'diff_speed_abs':['max'], \n", + " 'acceleration_abs':['median','max','mean','std'], \n", + " 
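# keep the first timestamp and the record count per mover and day\n", + " 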
't':['min','count'],\n", + " 'shiptype':lambda x:x.value_counts().index[0]}) \n", + "\n", + " traj_df.columns = [\"_\".join(x) for x in traj_df.columns.ravel()]\n", + " traj_df = traj_df.rename(columns={'t_count':'n', 'shiptype_':'shiptype', \n", + " 'diff_t_s_sum':'duration_s', 'dist_prev_m_sum':'length_m'})\n", + " traj_df['length_km'] = traj_df['length_m'] / 1000\n", + " traj_df['duration_h'] = traj_df['duration_s'] / 3600\n", + " traj_df['t_min_h'] = traj_df['t_min'].dt.hour + traj_df['t_min'].dt.minute / 60\n", + "\n", + " traj_df = traj_df[traj_df.n>=MINIMUM_NUMBER_OF_RECORDS]\n", + " traj_df = traj_df[traj_df['speed_m/s_median']>=MINIMUM_SPEED_MS]\n", + " return traj_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "try:\n", + " traj_df = pd.read_pickle('./traj.pkl')\n", + "except:\n", + " traj_df = compute_traj_info(segment_df)\n", + " traj_df.to_pickle(\"./traj.pkl\")\n", + " \n", + "traj_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hvplot.scatter_matrix(\n", + " traj_df[['length_km', 'speed_m/s_median', 'duration_h', 'acceleration_abs_mean', 'diff_dir_median']]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### C-3. Tracker heterogeneity\n", + "\n", + "\n", + "This step addresses the question whether the dataset contains records from devices with different tracking characteristics. Devices with GPS tracking capabilities vary widely in performance. For example, when data is collected using smartphone apps, coordinates may have passed through a variety of (not always fully transparent) preprocessing steps that depend on the operating system version and hardware manufacturer. \n", + "\n", + "Unexpected heterogeneity of tracker characteristics can affect the performance of analysis tasks. For example, if a pattern matcher was trained using high-frequency data with regular sampling intervals, it may not perform as expected when inputs change to irregular sampling intervals or lower frequency. If some trackers exhibit higher GPS noise than other trackers in the dataset, the higher-noise trajectories will suffer an overestimation of distance and derived measures (for more details on noise see step D-2).\n", + "\n", + "Effects of heterogeneous trackers can be hard to distinguish from effects of heterogeneous movers. Tracker heterogeneity may result in sampling rates and/or spatial accuracy that differ between movers. Furthermore, differences in the availability of additional attribute data within movement records may point towards tracker heterogeneity. \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "traj_df[(traj_df['diff_t_s_median']<=120) & (traj_df['speed_m/s_median']>0)] \\\n", + " .hvplot.scatter(\n", + " title='Median sampling interval over median speed', alpha=0.3,\n", + " x='diff_t_s_median', y='speed_m/s_median', hover_cols=['id','time'], \n", + " frame_width=FIGSIZE[1], frame_height=FIGSIZE[1], ylim=(-10,100))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## D. Accuracy problems \n", + "\n", + "Incorrect mover identities, coordinates, and timestamps can affect movement data analyses in a variety of ways. These problems usually cannot be detected from elementary position records. Therefore, intermediate segments or overall trajectories are needed. 
The following protocol steps target issues of mover identity, as well as spatial and temporal inaccuracy. \n", + "\n", + "\n", + "### D-1. Mover identity issues \n", + "\n", + "Reliable mover identifiers are needed to identify which movement data records belong to the same mover. Identity issues occur when IDs are not unique, i.e. if multiple movers are assigned the same identifier. A single mover may also be referred to by multiple different identifiers, either at the same time or due to changes over time. This can happen because the data collection system or (pre)processing workflow reassigns identifiers based on business rules or in regular time intervals. \n", + "\n", + "#### Non-unique IDs\n", + "\n", + "This step addresses the question whether the dataset contains cases of non-unique identifiers. Due to misconfiguration of trackers or (pre)processing errors, the same identifier may be assigned to multiple movers simultaneously (or to different movers over time which is covered by the unstable IDs step). \n", + "\n", + "Simultaneous non-unique IDs result in trajectories that connect location records by multiple movers traveling on their distinct paths. The resulting trajectory therefore jumps between locations along these different paths. Consequently, the trajectory assumes a zigzag shape and speeds derived from consecutive location records assume unrealistic values. Non-unique IDs can make it impossible to reliably distinguish affected movers. In some settings, it may be possible to salvage records if they include sufficient other information that can be used to infer identity, for example, mover names or mover properties (such as size, type or color).\n", + "\n", + "Scatter plots of trajectory length and direction change are useful to identify cases of non-unique IDs. Assuming that movers with identical IDs do not typically travel in close vicinity, potential candidates for non-unique IDs are characterized by long trajectories with high direction change values.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "traj_df.hvplot.scatter(\n", + " title='Trajectory length over direction difference (median)', alpha=0.3,\n", + " x='length_km', y='diff_dir_median', hover_cols=['id','time'], \n", + " frame_width=FIGSIZE[1], frame_height=250#, \n", + ") + traj_df.sort_values(by='length_km', ascending=False)[:10][['length_km', 'speed_m/s_median', 'diff_dir_median']].hvplot.table(\n", + " title='Top 10 trajectories - length', frame_width=FIGSIZE[1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_single_mover(segment_df, 1, date(2017,7,1)) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "traj_df = traj_df.drop(1, level='id')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Unstable IDs\n", + "\n", + "This step addresses the question whether mover identifiers are stable and for how long they remain stable. Some data sources do not provide permanently stable identifiers. Systems may reassign identifiers based on business rules or in regular time intervals. For example, taxi floating car systems may not include stable vehicle IDs, instead relying on trip IDs that are reassigned whenever a taxi finishes a trip. \n", + "\n", + "Unstable IDs can limit the analysis potential of the dataset. 
For example, if trip IDs keep changing and there is no stable mover ID, it becomes difficult to reliably determine mover statistics, such as the daily number of trips or the total distance moved. \n", "\n", "Scatter matrices of trajectory duration versus start time (a combination of scatter plots and histograms) are useful to find out how often IDs change and whether they tend to change at the same time. For example, if IDs change daily and at the same time, the histograms will exhibit spikes at the 24-hour duration and corresponding time of day. In contrast, if IDs are stable over the whole observation period, the histograms will reflect the time spans during which individual movers were tracked. \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "hvplot.scatter_matrix(traj_df[['t_min_h', 'duration_h']])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### D-2. Spatial inaccuracy \n", "\n", "Coordinate errors range from basic noise due to the inherent inaccuracy of GPS to large jumps caused by technical errors or deliberate action. \n", "\n", "\n", "#### Outliers with unrealistic jumps\n", "\n", "\n", "This step addresses the question if trajectories contain large erroneous jumps that result in unrealistic derived speed values and require data cleaning. The limit for being unrealistic depends on the use case. For example, for ground-based transport, Fillekes et al. (2019) set the limit at 330 km/h based on the maximum speed of German high-speed trains. \n", "\n", "Large jumps affect derived segment measures, including direction, length, speed, and acceleration. Consequently, they also affect trajectory measures, which are aggregates of these segment measures. However, single jumps may not be immediately recognizable when looking at trajectory measures only.\n", "\n", "Histograms of derived speed between consecutive location records are useful to see if there is a long tail of high speed values. \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "segment_df['speed_m/s'].hvplot.hist(\n", " title='Histogram of speed between consecutive records', bins=100, frame_width=FIGSIZE[1], frame_height=250\n", ") + segment_df.sort_values(by='speed_m/s', ascending=False)[:10][['id', 'speed_m/s']].hvplot.table(\n", " title='Top 10 records - speed', frame_width=FIGSIZE[1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_single_mover(segment_df, 218057000, date(2018,1,1))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "plot_single_mover(segment_df, 219348000, date(2017,7,1))" ] },
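{ "cell_type": "markdown", "metadata": {}, "source": [ "As a minimal cleaning sketch (reusing the 330 km/h limit cited above as an assumption; a much lower limit would be appropriate for vessels), implausible jumps can be filtered out:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sketch: drop records whose derived speed exceeds a use-case-specific limit\n", "SPEED_LIMIT_MS = 330 / 3.6  # 330 km/h in m/s (after Fillekes et al. 2019)\n", "cleaned_df = segment_df[(segment_df['speed_m/s'] < SPEED_LIMIT_MS) | segment_df['speed_m/s'].isna()]" ] },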
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### Jitter / noise\n", "\n", "This step addresses the question of how noisy the trajectories are. GPS tracks are inherently noisy. However, in some cases, what appears as excessive noise can reflect real movement patterns. For example, vessel routes may have a zig-zag shape in case of adverse weather conditions. Noise also affects trajectories of movers that are standing still, appearing as fake jittery movement. \n", "\n", "Jitter or noise causes a systematic \"overestimation of distance\" when the sampling frequency is high. On the other hand, distances are underestimated when the sampling frequency is low. Without evaluating the sampling frequency, distance and derived speed values are therefore insufficient for understanding noise. \n", "\n", "Scatter plots of direction change that compare median change and the observed standard deviation of change values can provide insights into the presence of excessive jitter or noise. However, high median values (approaching 180°) indicate out-of-sequence problems rather than jitter and will be discussed in the next step.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "traj_df.hvplot.scatter(\n", " title='Direction difference median over standard deviation', alpha=0.3,\n", " x='diff_dir_median', y='diff_dir_std', hover_cols=['id','time'], #datashade=True,\n", " frame_width=FIGSIZE[1], frame_height=250, ylim=(-10,100)\n", ") + traj_df.sort_values(by='diff_dir_median', ascending=False)[:10][['diff_dir_median','diff_dir_std']].hvplot.table(\n", " title='Top 10 trajectories - direction difference', frame_width=FIGSIZE[1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_single_mover(segment_df, 244063000, date(2018,1,1))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_single_mover(segment_df, 220614000, date(2018,1,1))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### D-3. Temporal inaccuracy\n", "\n", "Timestamp errors potentially affect the synchronization between trajectories as well as the order of records within individual trajectories. \n", "\n", "#### Time zone and daylight saving issues\n", "\n", "This step addresses the question of how time zones and daylight saving affect the dataset. In some datasets, time zone information may be included with each time stamp. However, this is usually not the case, and analysts have to resort to metadata or documentation, which are not always comprehensive or reliable. Time zone issues can be hard to detect, particularly if the dataset contains tracks from multiple time zones but the zone information got lost along the way. These issues may be discovered due to unexpected derived movement patterns, such as significant numbers of people leaving their homes in the middle of the night or excessive movement of nocturnal animals during the day. However, small shifts, such as missing daylight saving information, can be hard to distinguish from the normal variation. \n", "\n", "Unresolved time zone and daylight saving issues can have diverse side effects. For example, two tracks that appear to be moving together (collocated movement) may actually represent movement that happened at different points in time. Depending on the analysis area, a mix of local time and UTC (Coordinated Universal Time) could result in spikes of activity at unexpected times of the day. \n", "\n", "Temporal charts of record counts are helpful to detect gaps or double counting at the date and time when daylight saving goes into and out of effect. Two-dimensional time histograms of movement properties, such as speed, can help recognize time zone issues by revealing unusual temporal patterns. \n" ] },
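{ "cell_type": "markdown", "metadata": {}, "source": [ "A minimal sketch for testing a time zone hypothesis (it assumes the timestamps are UTC and that local time in the study area is Europe/Copenhagen):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sketch: compare a UTC and a local-time view of the timestamps\n", "# ('Europe/Copenhagen' is an assumption based on the study area)\n", "local_time = segment_df.index.tz_localize('UTC').tz_convert('Europe/Copenhagen')" ] },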
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tmp = segment_df[segment_df['speed_m/s']>1]\n", "hourly = tmp['id'].groupby([tmp.index.hour, pd.Grouper(freq='d')]).count().to_frame(name='n')\n", "hourly.rename_axis(['hour', 'day'], inplace=True)\n", "hourly.hvplot.heatmap(title='Count of records with speed > 1m/s', x='hour', y='day', C='n', width=FIGSIZE[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Out-of-sequence positions\n", "\n", "This step addresses the question if records belonging to a trajectory appear out of sequence. A closely related problem is when a mover appears at two different locations at the same time. These problems can happen in systems that do not provide tracker timestamps and instead use receiver or storage time. For example, the Automatic Identification System (AIS) protocol does not transmit tracker timestamps and instead provides only offsets (in seconds) from the previously transmitted message, which is insufficient to establish temporal order \"since positional updates from a single vessel may come from a series of base stations (those within range of its antenna along the route)\" (Patroumpas et al. 2017). \n", "\n", "Out-of-sequence positions affect many derived trajectory measures, including direction, length, speed, and acceleration. In affected trajectories, these measures will be severely overestimated. \n", "\n", "Scatter plots of direction change and speed are helpful to detect out-of-sequence problems. The sudden reversals of movement direction result in high direction change values (approaching 180°) and high speeds. \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "traj_df.hvplot.scatter(\n", " title='Direction difference (median) over speed (median)', alpha=0.3,\n", " x='diff_dir_median', y='speed_m/s_median', hover_cols=['id','time'], #datashade=True,\n", " frame_width=FIGSIZE[1], frame_height=250#, ylim=(-10,100)\n", ") + traj_df.sort_values(by='diff_dir_median', ascending=False)[:10][['diff_dir_median','diff_dir_std','speed_m/s_median']].hvplot.table(\n", " title='Top 10 trajectories - direction difference', frame_width=FIGSIZE[1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_single_mover(segment_df, 308322000, date(2017,7,1))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_single_mover(segment_df, 265615040, date(2017,7,1))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 4 }