From 3ea67696688f9b463ccd563fb24761ef4666e4d6 Mon Sep 17 00:00:00 2001 From: Sal Date: Sun, 22 Aug 2021 08:49:19 -0400 Subject: [PATCH] - add arrow:schema support. - move metadata stuff to its own project to reduce package pollution on main project --- src/ParquetFileViewer.sln | 13 +- .../Helpers/ExtensionMethods.cs | 34 +---- src/ParquetFileViewer/MetadataViewer.cs | 45 +------ .../ParquetFileViewer.csproj | 10 +- src/ParquetFileViewer/packages.config | 1 - src/Utilities/ParquetMetadataAnalyzers.cs | 121 ++++++++++++++++++ src/Utilities/Utilities.csproj | 13 ++ 7 files changed, 159 insertions(+), 78 deletions(-) create mode 100644 src/Utilities/ParquetMetadataAnalyzers.cs create mode 100644 src/Utilities/Utilities.csproj diff --git a/src/ParquetFileViewer.sln b/src/ParquetFileViewer.sln index 395c615..0bd5359 100644 --- a/src/ParquetFileViewer.sln +++ b/src/ParquetFileViewer.sln @@ -1,10 +1,12 @@  Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 2013 -VisualStudioVersion = 12.0.40629.0 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.29201.188 MinimumVisualStudioVersion = 10.0.40219.1 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ParquetFileViewer", "ParquetFileViewer\ParquetFileViewer.csproj", "{6019FC1B-3610-4682-BF96-8345C95CB7EC}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Utilities", "Utilities\Utilities.csproj", "{F423D115-06A0-47AF-A86E-2775E2F894F8}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -15,8 +17,15 @@ Global {6019FC1B-3610-4682-BF96-8345C95CB7EC}.Debug|Any CPU.Build.0 = Debug|Any CPU {6019FC1B-3610-4682-BF96-8345C95CB7EC}.Release|Any CPU.ActiveCfg = Release|Any CPU {6019FC1B-3610-4682-BF96-8345C95CB7EC}.Release|Any CPU.Build.0 = Release|Any CPU + {F423D115-06A0-47AF-A86E-2775E2F894F8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {F423D115-06A0-47AF-A86E-2775E2F894F8}.Debug|Any CPU.Build.0 = Debug|Any CPU + 
{F423D115-06A0-47AF-A86E-2775E2F894F8}.Release|Any CPU.ActiveCfg = Release|Any CPU + {F423D115-06A0-47AF-A86E-2775E2F894F8}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {24015CE0-473F-4A3B-89BE-E0CEEEA261B6} + EndGlobalSection EndGlobal diff --git a/src/ParquetFileViewer/Helpers/ExtensionMethods.cs b/src/ParquetFileViewer/Helpers/ExtensionMethods.cs index b49add7..c1f1880 100644 --- a/src/ParquetFileViewer/Helpers/ExtensionMethods.cs +++ b/src/ParquetFileViewer/Helpers/ExtensionMethods.cs @@ -1,9 +1,5 @@ -using Newtonsoft.Json; -using Newtonsoft.Json.Linq; -using System; -using System.Collections.Generic; +using System.Collections.Generic; using System.Data; -using System.IO; namespace ParquetFileViewer.Helpers { @@ -23,33 +19,5 @@ public static IList GetColumnNames(this DataTable datatable) } return columns; } - - public static string FormatJSON(this string input) - { - if (input == null) - return null; - - try - { - return JValue.Parse(input).ToString(Formatting.Indented); - } - catch (Exception) - { - //malformed json detected - return input; - } - } - - public static string Base64Encode(this string plainText) - { - var plainTextBytes = System.Text.Encoding.UTF8.GetBytes(plainText); - return Convert.ToBase64String(plainTextBytes); - } - - public static string Base64Decode(this string base64EncodedData) - { - var base64EncodedBytes = Convert.FromBase64String(base64EncodedData); - return System.Text.Encoding.UTF8.GetString(base64EncodedBytes); - } } } diff --git a/src/ParquetFileViewer/MetadataViewer.cs b/src/ParquetFileViewer/MetadataViewer.cs index 91cf78a..509af6f 100644 --- a/src/ParquetFileViewer/MetadataViewer.cs +++ b/src/ParquetFileViewer/MetadataViewer.cs @@ -1,12 +1,8 @@ -using Parquet.Thrift; -using ParquetFileViewer.Helpers; -using System; -using System.Linq; +using System; using 
System.Collections.Generic; using System.Drawing; -using System.Text; -using System.Threading.Tasks; using System.Windows.Forms; +using Utilities; namespace ParquetFileViewer { @@ -58,34 +54,8 @@ private void MainBackgroundWorker_DoWork(object sender, System.ComponentModel.Do var metadataResult = new List<(string TabName, string Text)>(); if (parquetReader.ThriftMetadata != null) { - var thriftMetadata = parquetReader.ThriftMetadata; - var jsonObject = new Newtonsoft.Json.Linq.JObject(); - jsonObject[nameof(thriftMetadata.Version)] = thriftMetadata.Version; - jsonObject[nameof(thriftMetadata.Num_rows)] = thriftMetadata.Num_rows; - jsonObject[nameof(thriftMetadata.Created_by)] = thriftMetadata.Created_by; - - var schemas = new Newtonsoft.Json.Linq.JArray(); - foreach (var schema in thriftMetadata.Schema) - { - if ("schema".Equals(schema.Name) && schemas.Count == 0) - continue; - - var schemaObject = new Newtonsoft.Json.Linq.JObject(); - schemaObject[nameof(schema.Field_id)] = schema.Field_id; - schemaObject[nameof(schema.Name)] = schema.Name; - schemaObject[nameof(schema.Type)] = schema.Type.ToString(); - schemaObject[nameof(schema.Type_length)] = schema.Type_length; - schemaObject[nameof(schema.LogicalType)] = schema.LogicalType?.ToString(); - schemaObject[nameof(schema.Scale)] = schema.Scale; - schemaObject[nameof(schema.Precision)] = schema.Precision; - schemaObject[nameof(schema.Repetition_type)] = schema.Repetition_type.ToString(); - schemaObject[nameof(schema.Converted_type)] = schema.Converted_type.ToString(); - - schemas.Add(schemaObject); - } - jsonObject[nameof(thriftMetadata.Schema)] = schemas; - - metadataResult.Add((THRIFT_METADATA, jsonObject.ToString().FormatJSON())); + string json = ParquetMetadataAnalyzers.ThriftMetadataToJSON(parquetReader.ThriftMetadata); + metadataResult.Add((THRIFT_METADATA, json)); } else metadataResult.Add((THRIFT_METADATA, "No thrift metadata available")); @@ -97,14 +67,11 @@ private void MainBackgroundWorker_DoWork(object 
sender, System.ComponentModel.Do string value = _customMetadata.Value; if (PANDAS_SCHEMA.Equals(_customMetadata.Key)) { - value = value.FormatJSON(); + value = ParquetMetadataAnalyzers.PandasSchemaToJSON(value); } else if (APACHE_ARROW_SCHEMA.Equals(_customMetadata.Key)) { - //TODO: Base64 decode on its own doesn't accomplish anything. - //Need some way to read the schema but there isn't anything in the apache arrow repo for this... - //https://github.com/apache/arrow/blob/master/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs - //value = value.Base64Decode(); + value = ParquetMetadataAnalyzers.ApacheArrowToJSON(value); } metadataResult.Add((_customMetadata.Key, value)); diff --git a/src/ParquetFileViewer/ParquetFileViewer.csproj b/src/ParquetFileViewer/ParquetFileViewer.csproj index d71c1d7..cc0ee6d 100644 --- a/src/ParquetFileViewer/ParquetFileViewer.csproj +++ b/src/ParquetFileViewer/ParquetFileViewer.csproj @@ -3,6 +3,7 @@ + PackageReference Debug AnyCPU {6019FC1B-3610-4682-BF96-8345C95CB7EC} @@ -46,9 +47,6 @@ ..\packages\IronSnappy.1.3.0\lib\netstandard2.0\IronSnappy.dll - - ..\packages\Newtonsoft.Json.13.0.1\lib\net45\Newtonsoft.Json.dll - ..\packages\Parquet.Net.3.8.6\lib\netstandard2.0\Parquet.dll @@ -156,6 +154,12 @@ + + + {f423d115-06a0-47af-a86e-2775e2f894f8} + Utilities + + diff --git a/src/ParquetFileViewer/packages.config b/src/ParquetFileViewer/packages.config index b3f6259..6d94d8c 100644 --- a/src/ParquetFileViewer/packages.config +++ b/src/ParquetFileViewer/packages.config @@ -3,7 +3,6 @@ - diff --git a/src/Utilities/ParquetMetadataAnalyzers.cs b/src/Utilities/ParquetMetadataAnalyzers.cs new file mode 100644 index 0000000..c8db35c --- /dev/null +++ b/src/Utilities/ParquetMetadataAnalyzers.cs @@ -0,0 +1,121 @@ +using Apache.Arrow.Ipc; +using Apache.Arrow.Types; +using Newtonsoft.Json; +using Newtonsoft.Json.Linq; +using Parquet.Thrift; +using System; + +namespace Utilities +{ + public static class ParquetMetadataAnalyzers + { + public static 
string ApacheArrowToJSON(string base64)
+        {
+            //Converts the base64 encoded Apache Arrow schema value into indented JSON.
+            //Any failure (invalid base64, unreadable Arrow payload, serialization error) is
+            //returned to the caller as descriptive text instead of being thrown.
+            try
+            {
+                byte[] bytes = Convert.FromBase64String(base64);
+                using (ArrowStreamReader reader = new ArrowStreamReader(bytes))
+                {
+                    //The returned batch is intentionally discarded; only reader.Schema is used.
+                    //NOTE(review): presumably this read is what populates reader.Schema - confirm.
+                    reader.ReadNextRecordBatch();
+                    return JsonConvert.SerializeObject(reader.Schema, Formatting.Indented);
+                }
+            }
+            catch (Exception ex)
+            {
+                return $"Something went wrong while processing the schema:{Environment.NewLine}{Environment.NewLine}{ex.ToString()}";
+            }
+        }
+
+        /// <summary>
+        /// Renders the Thrift file metadata as indented JSON: Version, Num_rows, Created_by and a
+        /// Schema array holding one object per schema element (Field_id, Name, Type, Type_length,
+        /// LogicalType, Scale, Precision, Repetition_type, Converted_type).
+        /// </summary>
+        /// <param name="thriftMetadata">The Thrift file metadata to render.</param>
+        /// <returns>
+        /// The indented JSON text, or an error description containing the exception details
+        /// if anything goes wrong while building it.
+        /// </returns>
+        public static string ThriftMetadataToJSON(FileMetaData thriftMetadata)
+        {
+            try
+            {
+                var jsonObject = new JObject();
+                jsonObject[nameof(thriftMetadata.Version)] = thriftMetadata.Version;
+                jsonObject[nameof(thriftMetadata.Num_rows)] = thriftMetadata.Num_rows;
+                jsonObject[nameof(thriftMetadata.Created_by)] = thriftMetadata.Created_by;
+
+                var schemas = new JArray();
+                foreach (var schema in thriftMetadata.Schema)
+                {
+                    //Elements named "schema" are skipped for as long as nothing has been added yet
+                    //(presumably this is the root node of the schema tree - confirm).
+                    if ("schema".Equals(schema.Name) && schemas.Count == 0)
+                        continue;
+
+                    var schemaObject = new JObject();
+                    schemaObject[nameof(schema.Field_id)] = schema.Field_id;
+                    schemaObject[nameof(schema.Name)] = schema.Name;
+                    schemaObject[nameof(schema.Type)] = schema.Type.ToString();
+                    schemaObject[nameof(schema.Type_length)] = schema.Type_length;
+                    schemaObject[nameof(schema.LogicalType)] = schema.LogicalType?.ToString();
+                    schemaObject[nameof(schema.Scale)] = schema.Scale;
+                    schemaObject[nameof(schema.Precision)] = schema.Precision;
+                    schemaObject[nameof(schema.Repetition_type)] = schema.Repetition_type.ToString();
+                    schemaObject[nameof(schema.Converted_type)] = schema.Converted_type.ToString();
+
+                    schemas.Add(schemaObject);
+                }
+                jsonObject[nameof(thriftMetadata.Schema)] = schemas;
+
+                return jsonObject.ToString(Formatting.Indented);
+            }
+            catch (Exception ex)
+            {
+                return $"Something went wrong while processing the schema:{Environment.NewLine}{Environment.NewLine}{ex.ToString()}";
+            }
+        }
+
+        /// <summary>
+        /// Pretty-prints the pandas schema metadata value.
+        /// </summary>
+        /// <param name="pandas">The raw pandas metadata text, expected to be JSON.</param>
+        /// <returns>
+        /// The same JSON re-formatted with indentation, or the input unchanged when it cannot be parsed.
+        /// </returns>
+        public static string PandasSchemaToJSON(string pandas)
+        {
+            try
+            {
+                //Pandas is already json; so just make it pretty.
+                return JValue.Parse(pandas).ToString(Formatting.Indented);
+            }
+            catch (Exception)
+            {
+                //Malformed json detected; deliberately fall back to showing the raw text.
+                return pandas;
+            }
+        }
+    }
+}
diff --git a/src/Utilities/Utilities.csproj b/src/Utilities/Utilities.csproj
new file mode 100644
index 0000000..dedee66
--- /dev/null
+++ b/src/Utilities/Utilities.csproj
@@ -0,0 +1,13 @@
+
+
+
+    netstandard2.0
+
+
+
+
+
+
+
+
+