Skip to content

Commit

Permalink
- add arrow:schema support.
Browse files Browse the repository at this point in the history
- move metadata stuff to its own project to reduce package pollution on main project
  • Loading branch information
mukunku committed Aug 22, 2021
1 parent 0512e52 commit 3ea6769
Show file tree
Hide file tree
Showing 7 changed files with 159 additions and 78 deletions.
13 changes: 11 additions & 2 deletions src/ParquetFileViewer.sln
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2013
VisualStudioVersion = 12.0.40629.0
# Visual Studio Version 16
VisualStudioVersion = 16.0.29201.188
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ParquetFileViewer", "ParquetFileViewer\ParquetFileViewer.csproj", "{6019FC1B-3610-4682-BF96-8345C95CB7EC}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Utilities", "Utilities\Utilities.csproj", "{F423D115-06A0-47AF-A86E-2775E2F894F8}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand All @@ -15,8 +17,15 @@ Global
{6019FC1B-3610-4682-BF96-8345C95CB7EC}.Debug|Any CPU.Build.0 = Debug|Any CPU
{6019FC1B-3610-4682-BF96-8345C95CB7EC}.Release|Any CPU.ActiveCfg = Release|Any CPU
{6019FC1B-3610-4682-BF96-8345C95CB7EC}.Release|Any CPU.Build.0 = Release|Any CPU
{F423D115-06A0-47AF-A86E-2775E2F894F8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{F423D115-06A0-47AF-A86E-2775E2F894F8}.Debug|Any CPU.Build.0 = Debug|Any CPU
{F423D115-06A0-47AF-A86E-2775E2F894F8}.Release|Any CPU.ActiveCfg = Release|Any CPU
{F423D115-06A0-47AF-A86E-2775E2F894F8}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {24015CE0-473F-4A3B-89BE-E0CEEEA261B6}
EndGlobalSection
EndGlobal
34 changes: 1 addition & 33 deletions src/ParquetFileViewer/Helpers/ExtensionMethods.cs
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.Collections.Generic;
using System.Data;
using System.IO;

namespace ParquetFileViewer.Helpers
{
Expand All @@ -23,33 +19,5 @@ public static IList<string> GetColumnNames(this DataTable datatable)
}
return columns;
}

/// <summary>
/// Pretty-prints a JSON string using indented formatting.
/// </summary>
/// <param name="input">The JSON text to format; may be null.</param>
/// <returns>
/// Null when <paramref name="input"/> is null, the indented JSON when it parses,
/// otherwise the unmodified <paramref name="input"/>.
/// </returns>
public static string FormatJSON(this string input)
{
    if (input != null)
    {
        try
        {
            var parsedToken = JValue.Parse(input);
            return parsedToken.ToString(Formatting.Indented);
        }
        catch (Exception)
        {
            // Parsing failed, so the text is not valid JSON: hand it back as-is.
            return input;
        }
    }

    return null;
}

/// <summary>
/// Encodes the UTF-8 bytes of a string as base64 text.
/// </summary>
/// <param name="plainText">The text to encode.</param>
/// <returns>The base64 representation of the UTF-8 bytes of <paramref name="plainText"/>.</returns>
public static string Base64Encode(this string plainText)
    => Convert.ToBase64String(System.Text.Encoding.UTF8.GetBytes(plainText));

/// <summary>
/// Decodes base64 text and interprets the resulting bytes as a UTF-8 string.
/// </summary>
/// <param name="base64EncodedData">The base64 text to decode.</param>
/// <returns>The UTF-8 string stored in <paramref name="base64EncodedData"/>.</returns>
public static string Base64Decode(this string base64EncodedData)
    => System.Text.Encoding.UTF8.GetString(Convert.FromBase64String(base64EncodedData));
}
}
45 changes: 6 additions & 39 deletions src/ParquetFileViewer/MetadataViewer.cs
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
using Parquet.Thrift;
using ParquetFileViewer.Helpers;
using System;
using System.Linq;
using System;
using System.Collections.Generic;
using System.Drawing;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using Utilities;

namespace ParquetFileViewer
{
Expand Down Expand Up @@ -58,34 +54,8 @@ private void MainBackgroundWorker_DoWork(object sender, System.ComponentModel.Do
var metadataResult = new List<(string TabName, string Text)>();
if (parquetReader.ThriftMetadata != null)
{
var thriftMetadata = parquetReader.ThriftMetadata;
var jsonObject = new Newtonsoft.Json.Linq.JObject();
jsonObject[nameof(thriftMetadata.Version)] = thriftMetadata.Version;
jsonObject[nameof(thriftMetadata.Num_rows)] = thriftMetadata.Num_rows;
jsonObject[nameof(thriftMetadata.Created_by)] = thriftMetadata.Created_by;

var schemas = new Newtonsoft.Json.Linq.JArray();
foreach (var schema in thriftMetadata.Schema)
{
if ("schema".Equals(schema.Name) && schemas.Count == 0)
continue;

var schemaObject = new Newtonsoft.Json.Linq.JObject();
schemaObject[nameof(schema.Field_id)] = schema.Field_id;
schemaObject[nameof(schema.Name)] = schema.Name;
schemaObject[nameof(schema.Type)] = schema.Type.ToString();
schemaObject[nameof(schema.Type_length)] = schema.Type_length;
schemaObject[nameof(schema.LogicalType)] = schema.LogicalType?.ToString();
schemaObject[nameof(schema.Scale)] = schema.Scale;
schemaObject[nameof(schema.Precision)] = schema.Precision;
schemaObject[nameof(schema.Repetition_type)] = schema.Repetition_type.ToString();
schemaObject[nameof(schema.Converted_type)] = schema.Converted_type.ToString();

schemas.Add(schemaObject);
}
jsonObject[nameof(thriftMetadata.Schema)] = schemas;

metadataResult.Add((THRIFT_METADATA, jsonObject.ToString().FormatJSON()));
string json = ParquetMetadataAnalyzers.ThriftMetadataToJSON(parquetReader.ThriftMetadata);
metadataResult.Add((THRIFT_METADATA, json));
}
else
metadataResult.Add((THRIFT_METADATA, "No thrift metadata available"));
Expand All @@ -97,14 +67,11 @@ private void MainBackgroundWorker_DoWork(object sender, System.ComponentModel.Do
string value = _customMetadata.Value;
if (PANDAS_SCHEMA.Equals(_customMetadata.Key))
{
value = value.FormatJSON();
value = ParquetMetadataAnalyzers.PandasSchemaToJSON(value);
}
else if (APACHE_ARROW_SCHEMA.Equals(_customMetadata.Key))
{
//TODO: Base64 decode on its own doesn't accomplish anything.
//Need some way to read the schema but there isn't anything in the apache arrow repo for this...
//https://github.com/apache/arrow/blob/master/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs
//value = value.Base64Decode();
value = ParquetMetadataAnalyzers.ApacheArrowToJSON(value);
}

metadataResult.Add((_customMetadata.Key, value));
Expand Down
10 changes: 7 additions & 3 deletions src/ParquetFileViewer/ParquetFileViewer.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
<Import Project="..\packages\Costura.Fody.4.1.0\build\Costura.Fody.props" Condition="Exists('..\packages\Costura.Fody.4.1.0\build\Costura.Fody.props')" />
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<PropertyGroup>
<RestoreProjectStyle>PackageReference</RestoreProjectStyle>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProjectGuid>{6019FC1B-3610-4682-BF96-8345C95CB7EC}</ProjectGuid>
Expand Down Expand Up @@ -46,9 +47,6 @@
<Reference Include="IronSnappy, Version=1.3.0.0, Culture=neutral, PublicKeyToken=b1d4b1dc83bdcf31, processorArchitecture=MSIL">
<HintPath>..\packages\IronSnappy.1.3.0\lib\netstandard2.0\IronSnappy.dll</HintPath>
</Reference>
<Reference Include="Newtonsoft.Json, Version=13.0.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
<HintPath>..\packages\Newtonsoft.Json.13.0.1\lib\net45\Newtonsoft.Json.dll</HintPath>
</Reference>
<Reference Include="Parquet, Version=3.0.0.0, Culture=neutral, PublicKeyToken=d380b3dee6d01926, processorArchitecture=MSIL">
<HintPath>..\packages\Parquet.Net.3.8.6\lib\netstandard2.0\Parquet.dll</HintPath>
</Reference>
Expand Down Expand Up @@ -156,6 +154,12 @@
<None Include="Resources\coffee.gif" />
<Content Include="Resources\hourglass.gif" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\Utilities\Utilities.csproj">
<Project>{f423d115-06a0-47af-a86e-2775e2f894f8}</Project>
<Name>Utilities</Name>
</ProjectReference>
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
<PropertyGroup>
Expand Down
1 change: 0 additions & 1 deletion src/ParquetFileViewer/packages.config
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
<package id="Costura.Fody" version="4.1.0" targetFramework="net461" />
<package id="Fody" version="6.5.0" targetFramework="net461" developmentDependency="true" />
<package id="IronSnappy" version="1.3.0" targetFramework="net461" />
<package id="Newtonsoft.Json" version="13.0.1" targetFramework="net461" />
<package id="Parquet.Net" version="3.8.6" targetFramework="net461" />
<package id="System.Buffers" version="4.5.1" targetFramework="net461" />
<package id="System.Memory" version="4.5.4" targetFramework="net461" />
Expand Down
121 changes: 121 additions & 0 deletions src/Utilities/ParquetMetadataAnalyzers.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
using Apache.Arrow.Ipc;
using Apache.Arrow.Types;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using Parquet.Thrift;
using System;

namespace Utilities
{
/// <summary>
/// Converts the different kinds of metadata found in a Parquet file into
/// indented, human-readable JSON text.
/// </summary>
public static class ParquetMetadataAnalyzers
{
    /// <summary>
    /// Decodes a base64 encoded Apache Arrow schema and serializes it as indented JSON.
    /// </summary>
    /// <param name="base64">Base64 text that is decoded and handed to an <see cref="ArrowStreamReader"/>.</param>
    /// <returns>
    /// The reader's schema serialized as indented JSON, or an error description
    /// (including the exception details) when anything fails.
    /// </returns>
    public static string ApacheArrowToJSON(string base64)
    {
        try
        {
            byte[] bytes = Convert.FromBase64String(base64);
            using (ArrowStreamReader reader = new ArrowStreamReader(bytes))
            {
                // NOTE(review): presumably this read is what makes reader.Schema available - confirm
                // against the Apache.Arrow documentation before removing it.
                reader.ReadNextRecordBatch();

                // The hand-built JObject conversion that used to follow this return was
                // unreachable (CS0162) and has been removed; the output is unchanged.
                return JsonConvert.SerializeObject(reader.Schema, Formatting.Indented);
            }
        }
        catch (Exception ex)
        {
            return DescribeFailure(ex);
        }
    }

    /// <summary>
    /// Serializes selected fields of the Parquet thrift file metadata as indented JSON.
    /// </summary>
    /// <param name="thriftMetadata">The thrift metadata to describe.</param>
    /// <returns>
    /// Indented JSON containing Version, Num_rows, Created_by and a Schema array,
    /// or an error description (including the exception details) when anything fails.
    /// </returns>
    public static string ThriftMetadataToJSON(FileMetaData thriftMetadata)
    {
        try
        {
            var jsonObject = new JObject
            {
                [nameof(thriftMetadata.Version)] = thriftMetadata.Version,
                [nameof(thriftMetadata.Num_rows)] = thriftMetadata.Num_rows,
                [nameof(thriftMetadata.Created_by)] = thriftMetadata.Created_by
            };

            var schemas = new JArray();
            foreach (var schema in thriftMetadata.Schema)
            {
                // Elements named "schema" are skipped for as long as nothing has been added yet.
                // NOTE(review): presumably this drops the root schema element - confirm.
                if ("schema".Equals(schema.Name) && schemas.Count == 0)
                    continue;

                schemas.Add(new JObject
                {
                    [nameof(schema.Field_id)] = schema.Field_id,
                    [nameof(schema.Name)] = schema.Name,
                    [nameof(schema.Type)] = schema.Type.ToString(),
                    [nameof(schema.Type_length)] = schema.Type_length,
                    [nameof(schema.LogicalType)] = schema.LogicalType?.ToString(),
                    [nameof(schema.Scale)] = schema.Scale,
                    [nameof(schema.Precision)] = schema.Precision,
                    [nameof(schema.Repetition_type)] = schema.Repetition_type.ToString(),
                    [nameof(schema.Converted_type)] = schema.Converted_type.ToString()
                });
            }
            jsonObject[nameof(thriftMetadata.Schema)] = schemas;

            return jsonObject.ToString(Formatting.Indented);
        }
        catch (Exception ex)
        {
            return DescribeFailure(ex);
        }
    }

    /// <summary>
    /// Re-formats a pandas schema, which is already JSON, with indentation.
    /// </summary>
    /// <param name="pandas">The JSON text of the pandas schema.</param>
    /// <returns>
    /// The indented JSON, or the unmodified <paramref name="pandas"/> value when it cannot be parsed.
    /// </returns>
    public static string PandasSchemaToJSON(string pandas)
    {
        try
        {
            //Pandas is already json; so just make it pretty.
            return JValue.Parse(pandas).ToString(Formatting.Indented);
        }
        catch (Exception)
        {
            // Deliberate best effort: on malformed JSON show the raw text instead of failing.
            return pandas;
        }
    }

    /// <summary>
    /// Builds the error text returned when a schema could not be processed.
    /// </summary>
    /// <param name="ex">The exception that interrupted processing.</param>
    /// <returns>A fixed message followed by two line breaks and the exception's string form.</returns>
    private static string DescribeFailure(Exception ex)
    {
        return $"Something went wrong while processing the schema:{Environment.NewLine}{Environment.NewLine}{ex.ToString()}";
    }
}
}
13 changes: 13 additions & 0 deletions src/Utilities/Utilities.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>netstandard2.0</TargetFramework>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Apache.Arrow" Version="5.0.0" />
<PackageReference Include="Newtonsoft.Json" Version="13.0.1" />
<PackageReference Include="Parquet.Net" Version="3.8.6" />
</ItemGroup>

</Project>

0 comments on commit 3ea6769

Please sign in to comment.