From 5eb45b62da0805be975a8a36f4812b61c8cf8c12 Mon Sep 17 00:00:00 2001 From: onozaty Date: Sun, 9 Jan 2022 19:53:28 +0900 Subject: [PATCH 1/5] =?UTF-8?q?[gcount]=20gcount=E3=81=AE=E5=AE=9F?= =?UTF-8?q?=E8=A3=85(=E3=83=86=E3=82=B9=E3=83=88=E3=81=AF=E3=81=BE?= =?UTF-8?q?=E3=81=A0)=20#19?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmd/gcount.go | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++ cmd/root.go | 1 + 2 files changed, 123 insertions(+) create mode 100644 cmd/gcount.go diff --git a/cmd/gcount.go b/cmd/gcount.go new file mode 100644 index 0000000..6a7987d --- /dev/null +++ b/cmd/gcount.go @@ -0,0 +1,122 @@ +package cmd + +import ( + "io" + _sort "sort" + "strconv" + + "github.com/onozaty/csvt/csv" + "github.com/pkg/errors" + "github.com/spf13/cobra" +) + +func newGcountCmd() *cobra.Command { + + gcountCmd := &cobra.Command{ + Use: "gcount", + Short: "Count the number of records in each group", + RunE: func(cmd *cobra.Command, args []string) error { + + format, err := getFlagBaseCsvFormat(cmd.Flags()) + if err != nil { + return err + } + + inputPath, _ := cmd.Flags().GetString("input") + targetColumnName, _ := cmd.Flags().GetString("column") + outputPath, _ := cmd.Flags().GetString("output") + countColumnName, _ := cmd.Flags().GetString("count-column") + + // 引数の解析に成功した時点で、エラーが起きてもUsageは表示しない + cmd.SilenceUsage = true + + err = runGroupCount( + format, + inputPath, + targetColumnName, + countColumnName, + outputPath) + + if err != nil { + return err + } + + cmd.Printf("%d\n", count) + + return nil + }, + } + + gcountCmd.Flags().StringP("input", "i", "", "Input CSV file path.") + gcountCmd.MarkFlagRequired("input") + gcountCmd.Flags().StringP("column", "c", "", "Name of the column to use for grouping.") + gcountCmd.Flags().StringP("count-column", "", "COUNT", "(optional) Column name for the number of records.") + gcountCmd.Flags().StringP("output", "o", "", "Output CSV file path.") + gcountCmd.MarkFlagRequired("output") + + return gcountCmd +} + +func runGroupCount(format csv.Format, inputPath string, targetColumnName string, countColumnName string, outputPath string) error { + + reader, writer, close, err := setupInputOutput(inputPath, outputPath, format) + if err != nil { + return err + } + defer close() + + err = groupCount(reader, targetColumnName, countColumnName, writer) + if err != nil { + return err + } + + return writer.Flush() +} + +func groupCount(reader csv.CsvReader, targetColumnName string, countColumnName string, writer csv.CsvWriter) error { + + // ヘッダ + columnNames, err := reader.Read() + if err != nil { + return errors.Wrap(err, "failed to read the CSV file") + } + + targetColumnIndex, err := getTargetColumnIndex(columnNames, targetColumnName) + if err != nil { + return err + } + + counter := map[string]int{} + + for { + row, err := reader.Read() + if err == io.EOF { + break + } + if err != nil { + return errors.Wrap(err, "failed to read the CSV file") + } + + val := row[targetColumnIndex] + counter[val] = counter[val] + 1 + } + + if err := writer.Write([]string{targetColumnName, countColumnName}); err != nil { + return err + } + + // グループ化した値でソートして出力 + keys := []string{} + for k := range counter { + keys = append(keys, k) + } + _sort.Strings(keys) + + for _, k := range keys { + if err := writer.Write([]string{k, strconv.Itoa(counter[k])}); err != nil { + return err + } + } + + return nil +} diff --git a/cmd/root.go b/cmd/root.go index 62443a9..47098d3 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -44,6 +44,7 @@ func newRootCmd() *cobra.Command { rootCmd.AddCommand(newSortCmd()) rootCmd.AddCommand(newSplitCmd()) rootCmd.AddCommand(newHeadCmd()) + rootCmd.AddCommand(newGcountCmd()) for _, c := range rootCmd.Commands() { // フラグ以外は受け付けないように From aa7662dea5908f4499f3e72bffcd58c30d6c3ade Mon Sep 17 00:00:00 2001 From: onozaty Date: Sun, 9 Jan 2022 20:58:49 +0900 Subject: [PATCH 2/5] =?UTF-8?q?[gcount]=20=E5=8B=95=E4=BD=9C=E7=A2=BA?= =?UTF-8?q?=E8=AA=8D=E3=81=A7=E5=8B=95=E3=81=8B=E3=81=AA=E3=81=8B=E3=81=A3?= =?UTF-8?q?=E3=81=9F=E9=83=A8=E5=88=86=E3=82=92=E4=BF=AE=E6=AD=A3=20#19?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmd/gcount.go | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/cmd/gcount.go b/cmd/gcount.go index 6a7987d..f620103 100644 --- a/cmd/gcount.go +++ b/cmd/gcount.go @@ -30,26 +30,19 @@ func newGcountCmd() *cobra.Command { // 引数の解析に成功した時点で、エラーが起きてもUsageは表示しない cmd.SilenceUsage = true - err = runGroupCount( + return runGroupCount( format, inputPath, targetColumnName, countColumnName, outputPath) - - if err != nil { - return err - } - - cmd.Printf("%d\n", count) - - return nil }, } gcountCmd.Flags().StringP("input", "i", "", "Input CSV file path.") gcountCmd.MarkFlagRequired("input") gcountCmd.Flags().StringP("column", "c", "", "Name of the column to use for grouping.") + gcountCmd.MarkFlagRequired("column") gcountCmd.Flags().StringP("count-column", "", "COUNT", "(optional) Column name for the number of records.") gcountCmd.Flags().StringP("output", "o", "", "Output CSV file path.") gcountCmd.MarkFlagRequired("output") From 75c3df2895d327064ac3a3bd3585ac3c72dcf8b2 Mon Sep 17 00:00:00 2001 From: onozaty Date: Mon, 10 Jan 2022 23:01:39 +0900 Subject: [PATCH 3/5] =?UTF-8?q?[gcount]=20=E3=83=86=E3=82=B9=E3=83=88?= =?UTF-8?q?=E8=BF=BD=E5=8A=A0=20#19?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmd/gcount_test.go | 248 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100644 cmd/gcount_test.go diff --git a/cmd/gcount_test.go b/cmd/gcount_test.go new file mode 100644 index 0000000..711ad2c --- /dev/null +++ b/cmd/gcount_test.go @@ -0,0 +1,248 @@ +package cmd + +import ( + "os" + "testing" +) + +func TestGcountCmd(t *testing.T) { + + s := joinRows( + "col1,col2", + "1,B", + "2,A", + "3,a", + "4,A", + "5,C", + "6,C", + "7,A", + "8,AA", + "9,B", + "10,", + ) + + fi := createTempFile(t, s) + defer os.Remove(fi) + + fo := createTempFile(t, "") + defer os.Remove(fo) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "gcount", + "-i", fi, + "-o", fo, + "-c", "col2", + }) + + err := rootCmd.Execute() + if err != nil { + t.Fatal("failed test\n", err) + } + + result := readString(t, fo) + + expect := joinRows( + "col2,COUNT", + ",1", + "A,3", + "AA,1", + "B,2", + "C,2", + "a,1", + ) + + if result != expect { + t.Fatal("failed test\n", result) + } +} + +func TestGcountCmd_countColumn(t *testing.T) { + + s := joinRows( + "col1,col2", + "1,A", + "2,A", + "3,A", + ) + + fi := createTempFile(t, s) + defer os.Remove(fi) + + fo := createTempFile(t, "") + defer os.Remove(fo) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "gcount", + "-i", fi, + "-o", fo, + "-c", "col2", + "--count-column", "count", + }) + + err := rootCmd.Execute() + if err != nil { + t.Fatal("failed test\n", err) + } + + result := readString(t, fo) + + expect := joinRows( + "col2,count", + "A,3", + ) + + if result != expect { + t.Fatal("failed test\n", result) + } +} + +func TestGcountCmd_format(t *testing.T) { + + s := joinRows( + "col1\tcol2", + "1\ta", + "2\tb", + "3\ta", + "4\tb", + ) + + fi := createTempFile(t, s) + defer os.Remove(fi) + + fo := createTempFile(t, "") + defer os.Remove(fo) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "gcount", + "-i", fi, + "-o", fo, + "-c", "col2", + "--delim", `\t`, + }) + + err := rootCmd.Execute() + if err != nil { + t.Fatal("failed test\n", err) + } + + result := readString(t, fo) + + expect := joinRows( + "col2\tCOUNT", + "a\t2", + "b\t2", + ) + + if result != expect { + t.Fatal("failed test\n", result) + } +} + +func TestGcountCmd_invalidFormat(t *testing.T) { + + s := joinRows( + "col1,col2", + "1,1", + ) + + fi := createTempFile(t, s) + defer os.Remove(fi) + + fo := createTempFile(t, "") + defer os.Remove(fo) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "gcount", + "-i", fi, + "-o", fo, + "-c", "col1", + "--delim", "xx", + }) + + err := rootCmd.Execute() + if err == nil || err.Error() != "flag delim should be specified with a single character" { + t.Fatal("failed test\n", err) + } +} + +func TestGcountCmd_columnNotFound(t *testing.T) { + + s := joinRows( + "col1,col2", + "1,A", + "2,A", + "3,A", + ) + + fi := createTempFile(t, s) + defer os.Remove(fi) + + fo := createTempFile(t, "") + defer os.Remove(fo) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "gcount", + "-i", fi, + "-o", fo, + "-c", "col3", + }) + + err := rootCmd.Execute() + if err == nil || err.Error() != "missing col3 in the CSV file" { + t.Fatal("failed test\n", err) + } +} + +func TestGcountCmd_inputFileNotFound(t *testing.T) { + + fi := createTempFile(t, "") + defer os.Remove(fi) + + fo := createTempFile(t, "") + defer os.Remove(fo) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "gcount", + "-i", fi + "____", // 存在しないファイル + "-o", fo, + "-c", "col1", + }) + + err := rootCmd.Execute() + if err == nil { + t.Fatal("failed test\n", err) + } + + pathErr := err.(*os.PathError) + if pathErr.Path != fi+"____" || pathErr.Op != "open" { + t.Fatal("failed test\n", err) + } +} + +func TestGcountCmd_inputFileEmpty(t *testing.T) { + + fi := createTempFile(t, "") + defer os.Remove(fi) + + fo := createTempFile(t, "") + defer os.Remove(fo) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "gcount", + "-i", fi, + "-o", fo, + "-c", "col1", + }) + + err := rootCmd.Execute() + if err == nil || err.Error() != "failed to read the CSV file: EOF" { + t.Fatal("failed test\n", err) + } +} From 3353c9f87f9ab28a5701d240c64786c7fe7f5dcd Mon Sep 17 00:00:00 2001 From: onozaty Date: Wed, 12 Jan 2022 00:24:11 +0900 Subject: [PATCH 4/5] =?UTF-8?q?[group]=20=E3=82=B5=E3=83=96=E3=82=B3?= =?UTF-8?q?=E3=83=9E=E3=83=B3=E3=83=89=E5=90=8D=E3=82=92gcount->group?= =?UTF-8?q?=E3=81=AB=E5=A4=89=E6=9B=B4=20#19?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmd/{gcount.go => group.go} | 6 +++--- cmd/{gcount_test.go => group_test.go} | 28 +++++++++++++-------------- cmd/root.go | 2 +- 3 files changed, 18 insertions(+), 18 deletions(-) rename cmd/{gcount.go => group.go} (96%) rename cmd/{gcount_test.go => group_test.go} (88%) diff --git a/cmd/gcount.go b/cmd/group.go similarity index 96% rename from cmd/gcount.go rename to cmd/group.go index f620103..31d3fa8 100644 --- a/cmd/gcount.go +++ b/cmd/group.go @@ -10,11 +10,11 @@ import ( "github.com/spf13/cobra" ) -func newGcountCmd() *cobra.Command { +func newGroupCmd() *cobra.Command { gcountCmd := &cobra.Command{ - Use: "gcount", - Short: "Count the number of records in each group", + Use: "group", + Short: "Aggregate by group", RunE: func(cmd *cobra.Command, args []string) error { format, err := getFlagBaseCsvFormat(cmd.Flags()) diff --git a/cmd/gcount_test.go b/cmd/group_test.go similarity index 88% rename from cmd/gcount_test.go rename to cmd/group_test.go index 711ad2c..f523120 100644 --- a/cmd/gcount_test.go +++ b/cmd/group_test.go @@ -5,7 +5,7 @@ import ( "testing" ) -func TestGcountCmd(t *testing.T) { +func TestGroupCmd(t *testing.T) { s := joinRows( "col1,col2", @@ -29,7 +29,7 @@ func TestGcountCmd(t *testing.T) { rootCmd := newRootCmd() rootCmd.SetArgs([]string{ - "gcount", + "group", "-i", fi, "-o", fo, "-c", "col2", @@ -57,7 +57,7 @@ func TestGcountCmd(t *testing.T) { } } -func TestGcountCmd_countColumn(t *testing.T) { +func TestGroupCmd_countColumn(t *testing.T) { s := joinRows( "col1,col2", @@ -74,7 +74,7 @@ func TestGcountCmd_countColumn(t *testing.T) { rootCmd := newRootCmd() rootCmd.SetArgs([]string{ - "gcount", + "group", "-i", fi, "-o", fo, "-c", "col2", @@ -98,7 +98,7 @@ func TestGcountCmd_countColumn(t *testing.T) { } } -func TestGcountCmd_format(t *testing.T) { +func TestGroupCmd_format(t *testing.T) { s := joinRows( "col1\tcol2", @@ -116,7 +116,7 @@ func TestGcountCmd_format(t *testing.T) { rootCmd := newRootCmd() rootCmd.SetArgs([]string{ - "gcount", + "group", "-i", fi, "-o", fo, "-c", "col2", @@ -141,7 +141,7 @@ func TestGcountCmd_format(t *testing.T) { } } -func TestGcountCmd_invalidFormat(t *testing.T) { +func TestGroupCmd_invalidFormat(t *testing.T) { s := joinRows( "col1,col2", @@ -156,7 +156,7 @@ func TestGcountCmd_invalidFormat(t *testing.T) { rootCmd := newRootCmd() rootCmd.SetArgs([]string{ - "gcount", + "group", "-i", fi, "-o", fo, "-c", "col1", @@ -169,7 +169,7 @@ func TestGcountCmd_invalidFormat(t *testing.T) { } } -func TestGcountCmd_columnNotFound(t *testing.T) { +func TestGroupCmd_columnNotFound(t *testing.T) { s := joinRows( "col1,col2", @@ -186,7 +186,7 @@ func TestGcountCmd_columnNotFound(t *testing.T) { rootCmd := newRootCmd() rootCmd.SetArgs([]string{ - "gcount", + "group", "-i", fi, "-o", fo, "-c", "col3", @@ -198,7 +198,7 @@ func TestGcountCmd_columnNotFound(t *testing.T) { } } -func TestGcountCmd_inputFileNotFound(t *testing.T) { +func TestGroupCmd_inputFileNotFound(t *testing.T) { fi := createTempFile(t, "") defer os.Remove(fi) @@ -208,7 +208,7 @@ func TestGcountCmd_inputFileNotFound(t *testing.T) { rootCmd := newRootCmd() rootCmd.SetArgs([]string{ - "gcount", + "group", "-i", fi + "____", // 存在しないファイル "-o", fo, "-c", "col1", @@ -225,7 +225,7 @@ func TestGcountCmd_inputFileNotFound(t *testing.T) { } } -func TestGcountCmd_inputFileEmpty(t *testing.T) { +func TestGroupCmd_inputFileEmpty(t *testing.T) { fi := createTempFile(t, "") defer os.Remove(fi) @@ -235,7 +235,7 @@ func TestGcountCmd_inputFileEmpty(t *testing.T) { rootCmd := newRootCmd() rootCmd.SetArgs([]string{ - "gcount", + "group", "-i", fi, "-o", fo, "-c", "col1", diff --git a/cmd/root.go b/cmd/root.go index 47098d3..5e98421 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -44,7 +44,7 @@ func newRootCmd() *cobra.Command { rootCmd.AddCommand(newSortCmd()) rootCmd.AddCommand(newSplitCmd()) rootCmd.AddCommand(newHeadCmd()) - rootCmd.AddCommand(newGcountCmd()) + rootCmd.AddCommand(newGroupCmd()) for _, c := range rootCmd.Commands() { // フラグ以外は受け付けないように From 638ee55a0626b36408aeb4248e8c26ad2850dea7 Mon Sep 17 00:00:00 2001 From: onozaty Date: Wed, 12 Jan 2022 09:03:32 +0900 Subject: [PATCH 5/5] =?UTF-8?q?[group]=20README=E3=81=AB=E8=AA=AC=E6=98=8E?= =?UTF-8?q?=E8=BF=BD=E5=8A=A0=20#19?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/README.md b/README.md index ab66fa2..707e543 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ * [concat](#concat) Concat CSV files. * [count](#count) Count the number of records. * [exclude](#exclude) Exclude rows by included in another CSV file. +* [group](#group) Aggregate by group. * [filter](#filter) Filter rows by condition. * [head](#head) Show head few rows. * [header](#header) Show header. @@ -349,6 +350,66 @@ col1,col2 4,D ``` +## group + +Group by the value of the specified column and perform aggregation. + +Currently, only counting is supported. +It's like `GROUP BY` + `COUNT` in SQL. + +### Usage + +``` +csvt group -i INPUT -c COLUMN [--count-column COUNT_COLUMN] -o OUTPUT +``` + +``` +Usage: + csvt group [flags] + +Flags: + -i, --input string Input CSV file path. + -c, --column string Name of the column to use for grouping. + --count-column string (optional) Column name for the number of records. (default "COUNT") + -o, --output string Output CSV file path. + -h, --help help for group +``` + +### Example + +The contents of `input.csv`. + +``` +col1,col2 +1,B +2,B +3,A +4,D +5,C +6,D +7,D +8,E +9,A +10,D +``` + +Group the rows by the value of `col2` and aggregate the number of rows. + +``` +$ csvt group -i input.csv -c col2 -o output.csv +``` + +The contents of the created `output.csv`. + +``` +col2,COUNT +A,2 +B,2 +C,1 +D,4 +E,1 +``` + ## filter Create a new CSV file by filtering the input CSV file to rows that match the conditions.