diff --git a/README.md b/README.md index ab66fa2..707e543 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ * [concat](#concat) Concat CSV files. * [count](#count) Count the number of records. * [exclude](#exclude) Exclude rows by included in another CSV file. +* [group](#group) Aggregate by group. * [filter](#filter) Filter rows by condition. * [head](#head) Show head few rows. * [header](#header) Show header. @@ -349,6 +350,66 @@ col1,col2 4,D ``` +## group + +Group by the value of the specified column and perform aggregation. + +Currently, only counting is supported. +It's like `GROUP BY` + `COUNT` in SQL. + +### Usage + +``` +csvt group -i INPUT -c COLUMN [--count-column COUNT_COLUMN] -o OUTPUT +``` + +``` +Usage: + csvt group [flags] + +Flags: + -i, --input string Input CSV file path. + -c, --column string Name of the column to use for grouping. + --count-column string (optional) Column name for the number of records. (default "COUNT") + -o, --output string Output CSV file path. + -h, --help help for group +``` + +### Example + +The contents of `input.csv`. + +``` +col1,col2 +1,B +2,B +3,A +4,D +5,C +6,D +7,D +8,E +9,A +10,D +``` + +Group the rows by the value of `col2` and count the number of rows in each group. + +``` +$ csvt group -i input.csv -c col2 -o output.csv +``` + +The contents of the created `output.csv`. + +``` +col2,COUNT +A,2 +B,2 +C,1 +D,4 +E,1 +``` + ## filter Create a new CSV file by filtering the input CSV file to rows that match the conditions. 
diff --git a/cmd/group.go b/cmd/group.go new file mode 100644 index 0000000..31d3fa8 --- /dev/null +++ b/cmd/group.go @@ -0,0 +1,115 @@ +package cmd + +import ( + "io" + _sort "sort" + "strconv" + + "github.com/onozaty/csvt/csv" + "github.com/pkg/errors" + "github.com/spf13/cobra" +) + +func newGroupCmd() *cobra.Command { + + gcountCmd := &cobra.Command{ + Use: "group", + Short: "Aggregate by group", + RunE: func(cmd *cobra.Command, args []string) error { + + format, err := getFlagBaseCsvFormat(cmd.Flags()) + if err != nil { + return err + } + + inputPath, _ := cmd.Flags().GetString("input") + targetColumnName, _ := cmd.Flags().GetString("column") + outputPath, _ := cmd.Flags().GetString("output") + countColumnName, _ := cmd.Flags().GetString("count-column") + + // Argument parsing has succeeded, so from here on do not print Usage even if an error occurs + cmd.SilenceUsage = true + + return runGroupCount( + format, + inputPath, + targetColumnName, + countColumnName, + outputPath) + }, + } + + gcountCmd.Flags().StringP("input", "i", "", "Input CSV file path.") + gcountCmd.MarkFlagRequired("input") + gcountCmd.Flags().StringP("column", "c", "", "Name of the column to use for grouping.") + gcountCmd.MarkFlagRequired("column") + gcountCmd.Flags().StringP("count-column", "", "COUNT", "(optional) Column name for the number of records.") + gcountCmd.Flags().StringP("output", "o", "", "Output CSV file path.") + gcountCmd.MarkFlagRequired("output") + + return gcountCmd +} + +func runGroupCount(format csv.Format, inputPath string, targetColumnName string, countColumnName string, outputPath string) error { + + reader, writer, close, err := setupInputOutput(inputPath, outputPath, format) + if err != nil { + return err + } + defer close() + + err = groupCount(reader, targetColumnName, countColumnName, writer) + if err != nil { + return err + } + + return writer.Flush() +} + +func groupCount(reader csv.CsvReader, targetColumnName string, countColumnName string, writer csv.CsvWriter) error { + + // Header row: the first record read supplies the column names + columnNames, err := 
reader.Read() + if err != nil { + return errors.Wrap(err, "failed to read the CSV file") + } + + targetColumnIndex, err := getTargetColumnIndex(columnNames, targetColumnName) + if err != nil { + return err + } + + counter := map[string]int{} + + for { + row, err := reader.Read() + if err == io.EOF { + break + } + if err != nil { + return errors.Wrap(err, "failed to read the CSV file") + } + + val := row[targetColumnIndex] + counter[val] = counter[val] + 1 + } + + if err := writer.Write([]string{targetColumnName, countColumnName}); err != nil { + return err + } + + // Write one row per group, sorted by the grouped value + keys := []string{} + for k := range counter { + keys = append(keys, k) + } + _sort.Strings(keys) + + for _, k := range keys { + if err := writer.Write([]string{k, strconv.Itoa(counter[k])}); err != nil { + return err + } + } + + return nil +} diff --git a/cmd/group_test.go b/cmd/group_test.go new file mode 100644 index 0000000..f523120 --- /dev/null +++ b/cmd/group_test.go @@ -0,0 +1,248 @@ +package cmd + +import ( + "os" + "testing" +) + +func TestGroupCmd(t *testing.T) { + + s := joinRows( + "col1,col2", + "1,B", + "2,A", + "3,a", + "4,A", + "5,C", + "6,C", + "7,A", + "8,AA", + "9,B", + "10,", + ) + + fi := createTempFile(t, s) + defer os.Remove(fi) + + fo := createTempFile(t, "") + defer os.Remove(fo) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "group", + "-i", fi, + "-o", fo, + "-c", "col2", + }) + + err := rootCmd.Execute() + if err != nil { + t.Fatal("failed test\n", err) + } + + result := readString(t, fo) + + expect := joinRows( + "col2,COUNT", + ",1", + "A,3", + "AA,1", + "B,2", + "C,2", + "a,1", + ) + + if result != expect { + t.Fatal("failed test\n", result) + } +} + +func TestGroupCmd_countColumn(t *testing.T) { + + s := joinRows( + "col1,col2", + "1,A", + "2,A", + "3,A", + ) + + fi := createTempFile(t, s) + defer os.Remove(fi) + + fo := createTempFile(t, "") + defer os.Remove(fo) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "group", + "-i", fi, 
+ "-o", fo, + "-c", "col2", + "--count-column", "count", + }) + + err := rootCmd.Execute() + if err != nil { + t.Fatal("failed test\n", err) + } + + result := readString(t, fo) + + expect := joinRows( + "col2,count", + "A,3", + ) + + if result != expect { + t.Fatal("failed test\n", result) + } +} + +func TestGroupCmd_format(t *testing.T) { + + s := joinRows( + "col1\tcol2", + "1\ta", + "2\tb", + "3\ta", + "4\tb", + ) + + fi := createTempFile(t, s) + defer os.Remove(fi) + + fo := createTempFile(t, "") + defer os.Remove(fo) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "group", + "-i", fi, + "-o", fo, + "-c", "col2", + "--delim", `\t`, + }) + + err := rootCmd.Execute() + if err != nil { + t.Fatal("failed test\n", err) + } + + result := readString(t, fo) + + expect := joinRows( + "col2\tCOUNT", + "a\t2", + "b\t2", + ) + + if result != expect { + t.Fatal("failed test\n", result) + } +} + +func TestGroupCmd_invalidFormat(t *testing.T) { + + s := joinRows( + "col1,col2", + "1,1", + ) + + fi := createTempFile(t, s) + defer os.Remove(fi) + + fo := createTempFile(t, "") + defer os.Remove(fo) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "group", + "-i", fi, + "-o", fo, + "-c", "col1", + "--delim", "xx", + }) + + err := rootCmd.Execute() + if err == nil || err.Error() != "flag delim should be specified with a single character" { + t.Fatal("failed test\n", err) + } +} + +func TestGroupCmd_columnNotFound(t *testing.T) { + + s := joinRows( + "col1,col2", + "1,A", + "2,A", + "3,A", + ) + + fi := createTempFile(t, s) + defer os.Remove(fi) + + fo := createTempFile(t, "") + defer os.Remove(fo) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "group", + "-i", fi, + "-o", fo, + "-c", "col3", + }) + + err := rootCmd.Execute() + if err == nil || err.Error() != "missing col3 in the CSV file" { + t.Fatal("failed test\n", err) + } +} + +func TestGroupCmd_inputFileNotFound(t *testing.T) { + + fi := createTempFile(t, "") + defer os.Remove(fi) + + fo := 
createTempFile(t, "") + defer os.Remove(fo) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "group", + "-i", fi + "____", // path of a file that does not exist + "-o", fo, + "-c", "col1", + }) + + err := rootCmd.Execute() + if err == nil { + t.Fatal("failed test\n", err) + } + + pathErr := err.(*os.PathError) + if pathErr.Path != fi+"____" || pathErr.Op != "open" { + t.Fatal("failed test\n", err) + } +} + +func TestGroupCmd_inputFileEmpty(t *testing.T) { + + fi := createTempFile(t, "") + defer os.Remove(fi) + + fo := createTempFile(t, "") + defer os.Remove(fo) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "group", + "-i", fi, + "-o", fo, + "-c", "col1", + }) + + err := rootCmd.Execute() + if err == nil || err.Error() != "failed to read the CSV file: EOF" { + t.Fatal("failed test\n", err) + } +} diff --git a/cmd/root.go b/cmd/root.go index 62443a9..5e98421 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -44,6 +44,7 @@ func newRootCmd() *cobra.Command { rootCmd.AddCommand(newSortCmd()) rootCmd.AddCommand(newSplitCmd()) rootCmd.AddCommand(newHeadCmd()) + rootCmd.AddCommand(newGroupCmd()) for _, c := range rootCmd.Commands() { // フラグ以外は受け付けないように