Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes #13

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 17 additions & 4 deletions SalarySchedules.App/Program.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using System;
using System.IO;
using System.Linq;
using Newtonsoft.Json;
using SalarySchedules.Models;
using SalarySchedules.Parser;
Expand All @@ -10,24 +11,36 @@ class Program
{
static void Main(string[] args)
{
if (args.Length < 1)
var files = args.Where(a => a.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase));
bool pretty = args.Contains("/p");

if (!files.Any())
{
Console.WriteLine("USAGE:");
Console.WriteLine("\tSalarySchedules [OPTIONS] file1.pdf [file2.pdf file3.pdf fileN.pdf]");
Console.WriteLine();
Console.WriteLine("\tOPTIONS:");
Console.WriteLine("\t\t /p Pretty print JSON");
Console.WriteLine();
return;
}

ISalaryScheduleParser parser = new CSMSalaryScheduleParser();

foreach (var file in args)
foreach (var file in files)
{
try
{
Console.Write("Processing file {0}... ", file);
ISalarySchedule schedule = parser.Process(file);
string json = JsonConvert.SerializeObject(schedule);
string json = JsonConvert.SerializeObject(schedule, pretty ? Formatting.Indented : Formatting.None);
File.WriteAllText(file.Replace(".pdf", ".json"), json);
Console.WriteLine("Finished");
}
catch
catch (Exception ex)
{
Console.WriteLine("Error processing file {0}", file);
Console.Write(ex);
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion SalarySchedules.App/SalarySchedules.App.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<OutputType>Exe</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>SalarySchedules.App</RootNamespace>
<AssemblyName>SalarySchedules.App</AssemblyName>
<AssemblyName>SalarySchedules</AssemblyName>
<TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<TargetFrameworkProfile />
Expand Down
89 changes: 51 additions & 38 deletions SalarySchedules.Parser/CSMSalaryScheduleParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,14 @@ public IEnumerable<IEnumerable<string>> GetAlignmentCorrectedClassData(string fi
/// </summary>
private IEnumerable<IEnumerable<string>> getAlignmentCorrectedClassData(PdfReader reader)
{
return
Enumerable.Range(2, reader.NumberOfPages - 1)
.Select(n => reader.TextFromPage(n))
.Select(page => getPageChunks(page))
.Select(chunks => fixAlignment(chunks))
.ToArray();
for (int n = 2; n <= reader.NumberOfPages; n++)
{
var text = reader.TextFromPage(n);
var chunks = getPageChunks(text);
var fixedChunks = fixAlignment(chunks);

yield return fixedChunks;
}
}

/// <summary>
Expand All @@ -80,8 +82,8 @@ IEnumerable<string> getPageChunks(string page)
IEnumerable<string> chunks = page.Split(new[] { "\n" }, StringSplitOptions.RemoveEmptyEntries);

//skip the header rows when starting a new page
if (FieldPatterns.RunDate.IsMatch(chunks.First()))
chunks = chunks.SkipWhile(s => !FieldPatterns.DataHeader.IsMatch(s)).Skip(1);
if (Patterns.RunDate.IsMatch(chunks.First()))
chunks = chunks.SkipWhile(s => !Patterns.DataHeader.IsMatch(s)).Skip(1);

return chunks;
}
Expand Down Expand Up @@ -113,28 +115,33 @@ IEnumerable<string> fixAlignment(IEnumerable<string> chunks)
{
string replace = " ";
List<string> final = new List<string>();
Queue<string> queue = new Queue<string>(chunks);

var dataChunks = chunks.Any(c => Patterns.DataHeader.IsMatch(c))
? chunks.SkipWhile(c => !Patterns.DataHeader.IsMatch(c)).Skip(1)
: chunks;

Queue<string> queue = new Queue<string>(dataChunks);

while (queue.Any())
{
string current = FieldPatterns.ConsecutiveSpaces.Replace(queue.Dequeue(), replace).Trim();
string current = Patterns.ConsecutiveSpaces.Replace(queue.Dequeue(), replace).Trim();

if (queue.Any())
{
string next = queue.Peek().Trim();

if (FieldPatterns.ClassCode.IsMatch(current) && FieldPatterns.ClassTitle.IsMatch(next))
if (Patterns.ClassCode.IsMatch(current) && Patterns.ClassTitle.IsMatch(next))
{
current += " " + FieldPatterns.ConsecutiveSpaces.Replace(queue.Dequeue(), replace).Trim();
current += " " + Patterns.ConsecutiveSpaces.Replace(queue.Dequeue(), replace).Trim();
}
else if (FieldPatterns.Grade.IsMatch(current) && FieldPatterns.Rate.IsMatch(next)
&& !FieldPatterns.Grade.IsMatch(next) && !FieldPatterns.ClassCode.IsMatch(next))
else if (Patterns.Grade.IsMatch(current) && Patterns.Rate.IsMatch(next)
&& !Patterns.Grade.IsMatch(next) && !Patterns.ClassCode.IsMatch(next))
{
current += " " + FieldPatterns.ConsecutiveSpaces.Replace(queue.Dequeue(), replace).Trim();
current += " " + Patterns.ConsecutiveSpaces.Replace(queue.Dequeue(), replace).Trim();
}
}

final.Add(current.Replace(" -", "-").Replace("- ", "-"));
final.Add(Patterns.DashAndSpace.Replace(current, "-"));
}

return final;
Expand All @@ -150,9 +157,9 @@ FiscalYear readFiscalYear(PdfReader reader)

var text = reader.TextFromPage(1);

if (FieldPatterns.FiscalYear.IsMatch(text))
if (Patterns.FiscalYear.IsMatch(text))
{
var match = FieldPatterns.FiscalYear.Match(text);
var match = Patterns.FiscalYear.Match(text);
fiscalYear = new FiscalYear(match.Groups[1].Value, match.Groups[2].Value);
}

Expand All @@ -169,9 +176,14 @@ FiscalYear readFiscalYear(PdfReader reader)

var text = reader.TextFromPage(2);

if (FieldPatterns.RunDate.IsMatch(text))
if (Patterns.RunDate.IsMatch(text))
{
DateTime.TryParse(FieldPatterns.RunDate.Match(text).Groups[1].Value.Trim(), out reportDate);
var match = Patterns.RunDate.Match(text);
var group = String.IsNullOrEmpty(match.Groups[1].Value) && match.Groups.Count > 2
? match.Groups[2]
: match.Groups[1];

DateTime.TryParse(group.Value.Trim(), out reportDate);
}

if (reportDate != DateTime.MinValue)
Expand All @@ -191,16 +203,16 @@ IEnumerable<BargainingUnit> readBargainingUnits(PdfReader reader)
//the Fire BU is never output as a code in the BU table?
var text = reader.TextFromPage(1).Replace("Fire ", "FIR ");
//get all the BU chunks
var chunks = getPageChunks(text).Where(c => FieldPatterns.BargainingUnit.IsMatch(c));
var chunks = getPageChunks(text).Where(c => Patterns.BargainingUnit.IsMatch(c));

foreach (var chunk in chunks)
{
//get each of the BUs in this chunk
var matches = FieldPatterns.BargainingUnit.Matches(chunk);
var matches = Patterns.BargainingUnit.Matches(chunk);
//split the chunk at the BU code points (leaving their names)
var names = FieldPatterns.BargainingUnit.Split(chunk)
var names = Patterns.BargainingUnit.Split(chunk)
.Where(s => !String.IsNullOrEmpty(s.Trim()))
.Select(s => FieldPatterns.ConsecutiveSpaces.Replace(s, replace).Trim());
.Select(s => Patterns.ConsecutiveSpaces.Replace(s, replace).Trim());
for (int i = 0; i < matches.Count; i++)
{
var match = matches[i];
Expand All @@ -224,7 +236,7 @@ IEnumerable<JobClass> processClassesOnPage(IEnumerable<string> page)
{
var jobClasses = new List<JobClass>();

//to process the chunks sequentially (considering more than one at a time)
//process chunks sequentially, considering more than one at a time
var queue = new Queue<string>(page);

while (queue.Any())
Expand All @@ -233,23 +245,24 @@ IEnumerable<JobClass> processClassesOnPage(IEnumerable<string> page)
var steps = new List<JobClassStep>();
var currentStep = new JobClassStep();

IEnumerable<string> dataChunks = queue.Dequeue().Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries);
var dataChunks = queue.Dequeue().Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries);

//assign the title, code, bargaining unit, grade for this class
dataChunks = assignClassData(dataChunks.ToList(), jobClass);

//return any remaining chunks
var remainingChunks = assignClassData(dataChunks.ToList(), jobClass);

//if there is leftover data -> step definition
if (dataChunks.Any())
if (remainingChunks.Any())
{
currentStep = assignStepData(dataChunks);
currentStep = assignStepData(remainingChunks);
steps.Add(currentStep);
}

//add each subsequent step for this class
while (queue.Any() && queue.Peek().StartsWith(jobClass.Grade))
while (queue.Any() && Patterns.StartsWithWord(jobClass.Grade).IsMatch(queue.Peek()))
{
dataChunks = queue.Dequeue().Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries).Skip(1);
currentStep = assignStepData(dataChunks);
remainingChunks = queue.Dequeue().Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries).Skip(1);
currentStep = assignStepData(remainingChunks);
steps.Add(currentStep);
}

Expand All @@ -272,21 +285,21 @@ IEnumerable<string> assignClassData(IList<string> dataChunks, JobClass jobClass)
//which may overlap one another

// 1. code is always 4 consecutive integers
string code = dataChunks.FirstOrDefault(c => FieldPatterns.ClassCode.IsMatch(c));
string code = dataChunks.FirstOrDefault(c => Patterns.ClassCode.IsMatch(c));
if (!String.IsNullOrEmpty(code))
{
jobClass.Code = code;
dataChunks.Remove(code);
}
// 2. grade is always 3 consecutive integers
string grade = dataChunks.FirstOrDefault(c => FieldPatterns.Grade.IsMatch(c));
string grade = dataChunks.FirstOrDefault(c => Patterns.Grade.IsMatch(c));
if (!String.IsNullOrEmpty(grade))
{
jobClass.Grade = grade;
dataChunks.Remove(grade);
}
// 3. bargaining unit code is always 3 consecutive capital letters
string bu = dataChunks.FirstOrDefault(c => FieldPatterns.BargainingUnit.IsMatch(c));
string bu = dataChunks.FirstOrDefault(c => Patterns.BargainingUnit.IsMatch(c));
if (!String.IsNullOrEmpty(bu))
{
jobClass.BargainingUnit =
Expand Down Expand Up @@ -328,7 +341,7 @@ JobClassStep assignStepData(IEnumerable<string> dataChunks)
{
//convert to numeric and order increasing
var numberChunks = dataChunks.Select(d => decimal.Parse(d)).OrderBy(d => d).ToArray();
step.StepNumber = (int)numberChunks[0];
step.StepNumber = (int)numberChunks[0];
step.HourlyRate = numberChunks[1];
step.BiWeeklyRate = numberChunks[2];
step.MonthlyRate = numberChunks[3];
Expand All @@ -338,7 +351,7 @@ JobClassStep assignStepData(IEnumerable<string> dataChunks)
{
throw new InvalidOperationException(String.Format("Couldn't parse step data: {0}", String.Join(" ", dataChunks)));
}

return step;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,44 @@

namespace SalarySchedules.Parser
{
static class FieldPatterns
static class Patterns
{
public static Regex FiscalYear =
new Regex(@"fiscal\s+year\s+(\d{2})/(\d{2})", RegexOptions.Compiled | RegexOptions.IgnoreCase);

public static Regex RunDate =
new Regex(@"run date: (\d{1,2}/\d{1,2}/\d{2,4})", RegexOptions.Compiled | RegexOptions.IgnoreCase);

public static Regex DataHeader =
new Regex(@"Class Title Class Code BU Grade Step Hourly Rate Monthly Rate Annual Rate Bi-Weekly Rate", RegexOptions.Compiled);
public static Regex BargainingUnit =
new Regex(@"((?<= )|(?<!.))[A-Z]{3}((?= )|(?!.))", RegexOptions.Compiled);

public static Regex ClassTitle =
new Regex(@"[a-z\(]+[\./&\) -]*", RegexOptions.Compiled | RegexOptions.IgnoreCase);

public static Regex ClassCode =
new Regex(@"((?<= )|(?<!.))[0-9]{4}((?= )|(?!.))", RegexOptions.Compiled);

public static Regex BargainingUnit =
new Regex(@"((?<= )|(?<!.))[A-Z]{3}((?= )|(?!.))", RegexOptions.Compiled);
public static Regex ConsecutiveSpaces =
new Regex(@"\s{2,}", RegexOptions.Compiled);

public static Regex DashAndSpace =
new Regex(@"\s*-\s*", RegexOptions.Compiled);

public static Regex DataHeader =
new Regex(@"Class Title Class Code BU Grade Step Hourly Rate Monthly Rate Annual Rate Bi-Weekly Rate", RegexOptions.Compiled);

public static Regex FiscalYear =
new Regex(@"fiscal\s+year\s+(\d{2})/(\d{2})", RegexOptions.Compiled | RegexOptions.IgnoreCase);

public static Regex Grade =
new Regex(@"((?<= )|(?<!.))[0-9]{3}((?= )|(?!.))", RegexOptions.Compiled);

public static Regex Rate =
new Regex(@"((?<= )|(?<!.))[0-9,]{1,}\.\d{2}((?= )|(?!.))", RegexOptions.Compiled);
new Regex(@"((?<= )|(?<!.))[0-9,]{1,}\.\d{2}((?= )|(?!.))", RegexOptions.Compiled);

public static Regex RunDate =
new Regex(@"run date: (\d{1,2}/\d{1,2}/\d{2,4})|(\d{1,2}/\d{1,2}/\d{2,4})\srun date:", RegexOptions.Compiled | RegexOptions.IgnoreCase);

/// <summary>
/// Builds a regex that matches input beginning with the given word
/// immediately followed by a whitespace character.
/// </summary>
/// <param name="word">
/// The literal text expected at the start of the input. It is escaped
/// before being embedded in the pattern, so regex metacharacters in the
/// value (e.g. '.', '+', '(') are matched literally instead of being
/// interpreted as pattern syntax.
/// </param>
/// <returns>A new, anchored <see cref="Regex"/> for the word.</returns>
public static Regex StartsWithWord(string word)
{
    // A null word previously concatenated as an empty string; keep that
    // behavior rather than letting Regex.Escape throw on null.
    string literal = Regex.Escape(word ?? String.Empty);

    return new Regex("^" + literal + "\\s");
}

public static Regex Step =
new Regex(@"((?<= )|(?<!.))[1-5]((?= )|(?!.))", RegexOptions.Compiled);

public static Regex ConsecutiveSpaces = new Regex(@"\s{2,}");
}
}
2 changes: 1 addition & 1 deletion SalarySchedules.Parser/SalarySchedules.Parser.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
</ItemGroup>
<ItemGroup>
<Compile Include="Extensions.cs" />
<Compile Include="FieldPatterns.cs" />
<Compile Include="Patterns.cs" />
<Compile Include="ISalaryScheduleParser.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="CSMSalaryScheduleParser.cs" />
Expand Down