From 6ac7fa01018880a84e73ac4b9d6b6f8b2b4b79e8 Mon Sep 17 00:00:00 2001 From: onozaty Date: Mon, 5 Jul 2021 23:48:36 +0900 Subject: [PATCH 1/4] =?UTF-8?q?[join]=20column2=E3=83=95=E3=83=A9=E3=82=B0?= =?UTF-8?q?=E3=82=92column-second=E3=81=AB=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 20 ++++++++++---------- cmd/filter.go | 2 +- cmd/join.go | 4 ++-- cmd/join_test.go | 4 ++-- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 6d9b273..0d87302 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ * [choose](#choose) Choose columns. * [count](#count) Count the number of records. -* [filter](#filter) Filter rows. +* [filter](#filter) Filter rows by condition. * [header](#header) Show header. * [join](#join) Join CSV files. * [remove](#remove) Remove columns. @@ -304,14 +304,14 @@ Usage: csvt join [flags] Flags: - -1, --first string First CSV file path. - -2, --second string Second CSV file path. - -c, --column string Name of the column to use for joining. - --column2 string (optional) Name of the column to use for joining in the second CSV file. Specify if different from the first CSV file. - -o, --output string Output CSV file path. - --usingfile (optional) Use temporary files for joining. Use this when joining large files that will not fit in memory. - --norecord (optional) No error even if there is no record corresponding to sencod CSV. - -h, --help help for join + -1, --first string First CSV file path. + -2, --second string Second CSV file path. + -c, --column string Name of the column to use for joining. + --column-second string (optional) Name of the column to use for joining in the second CSV file. Specify if different from the first CSV file. + -o, --output string Output CSV file path. + --usingfile (optional) Use temporary files for joining. Use this when joining large files that will not fit in memory. 
+ --norecord (optional) No error even if there is no record corresponding to sencod CSV. + -h, --help help for join ``` ### Example @@ -366,7 +366,7 @@ If you don't want to raise an error even if there is no value, specify `--noreco $ csvt join -1 input1.csv -2 input2.csv -c CompanyID -o output.csv --norecord ``` -If the column name in the second CSV file is different from that in the first CSV file, specify it with `--column2`. +If the column name in the second CSV file is different from that in the first CSV file, specify it with `--column-second`. ``` $ csvt join -1 input1.csv -2 input2.csv -c CompanyID --column2 ID -o output.csv diff --git a/cmd/filter.go b/cmd/filter.go index 617b69e..ae92d2b 100644 --- a/cmd/filter.go +++ b/cmd/filter.go @@ -14,7 +14,7 @@ func newFilterCmd() *cobra.Command { filterCmd := &cobra.Command{ Use: "filter", - Short: "Filter rows", + Short: "Filter rows by condition", RunE: func(cmd *cobra.Command, args []string) error { format, err := getFlagBaseCsvFormat(cmd.Flags()) diff --git a/cmd/join.go b/cmd/join.go index ac296d5..9d2980b 100644 --- a/cmd/join.go +++ b/cmd/join.go @@ -27,7 +27,7 @@ func newJoinCmd() *cobra.Command { joinColumnName, _ := cmd.Flags().GetString("column") outputPath, _ := cmd.Flags().GetString("output") - secondJoinColumnName, _ := cmd.Flags().GetString("column2") + secondJoinColumnName, _ := cmd.Flags().GetString("column-second") useFileTable, _ := cmd.Flags().GetBool("usingfile") noRecordNoError, _ := cmd.Flags().GetBool("norecord") joinOptions := JoinOptions{ @@ -49,7 +49,7 @@ func newJoinCmd() *cobra.Command { joinCmd.MarkFlagRequired("second") joinCmd.Flags().StringP("column", "c", "", "Name of the column to use for joining.") joinCmd.MarkFlagRequired("column") - joinCmd.Flags().StringP("column2", "", "", "(optional) Name of the column to use for joining in the second CSV file. 
Specify if different from the first CSV file.") + joinCmd.Flags().StringP("column-second", "", "", "(optional) Name of the column to use for joining in the second CSV file. Specify if different from the first CSV file.") joinCmd.Flags().StringP("output", "o", "", "Output CSV file path.") joinCmd.MarkFlagRequired("output") joinCmd.Flags().BoolP("usingfile", "", false, "(optional) Use temporary files for joining. Use this when joining large files that will not fit in memory.") diff --git a/cmd/join_test.go b/cmd/join_test.go index a2632c5..06b82a9 100644 --- a/cmd/join_test.go +++ b/cmd/join_test.go @@ -223,7 +223,7 @@ func TestRunJoin_norecord(t *testing.T) { } } -func TestRunJoin_column2(t *testing.T) { +func TestRunJoin_columnSecond(t *testing.T) { s1 := `ID,Name,CompanyID 1,Yamada,1 @@ -252,7 +252,7 @@ func TestRunJoin_column2(t *testing.T) { "-2", f2.Name(), "-o", fo.Name(), "-c", "CompanyID", - "--column2", "ID", + "--column-second", "ID", }) err := rootCmd.Execute() From 11b2934bbd22aa422719abcc5c4fa92a7ef48c86 Mon Sep 17 00:00:00 2001 From: onozaty Date: Tue, 6 Jul 2021 09:04:34 +0900 Subject: [PATCH 2/4] =?UTF-8?q?[include]=20include=E3=82=B3=E3=83=9E?= =?UTF-8?q?=E3=83=B3=E3=83=89=E5=AE=9F=E8=A3=85=20#8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 58 ++++++ cmd/include.go | 129 +++++++++++++ cmd/include_test.go | 432 ++++++++++++++++++++++++++++++++++++++++++++ cmd/root.go | 1 + csv/itemset.go | 61 +++++++ csv/itemset_test.go | 97 ++++++++++ 6 files changed, 778 insertions(+) create mode 100644 cmd/include.go create mode 100644 cmd/include_test.go create mode 100644 csv/itemset.go create mode 100644 csv/itemset_test.go diff --git a/README.md b/README.md index 0d87302..4e233d1 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ * [count](#count) Count the number of records. * [filter](#filter) Filter rows by condition. * [header](#header) Show header. 
+* [include](#include) Filter rows by included in another CSV file. * [join](#join) Join CSV files. * [remove](#remove) Remove columns. * [rename](#rename) Rename columns. @@ -286,6 +287,63 @@ Age CompanyID ``` +## include + +Create a new CSV file by filtering on the rows included in another CSV file. + +### Usage + +``` +csvt include -i INPUT -c COLUMN -a ANOTHER [--column-another COLUMN2] -o OUTPUT +``` + +``` +Usage: + csvt include [flags] + +Flags: + -i, --input string Input CSV file path. + -c, --column string Name of the column to use for filtering. + -a, --another string Another CSV file path. Filter by included in this CSV file. + --column-another string (optional) Name of the column to use for filtering in the another CSV file. Specify if different from the input CSV file. + -o, --output string Output CSV file path. + -h, --help help for include +``` + +### Example + +The contents of `input.csv`. + +``` +col1,col2 +1,A +2,B +3,C +4,D +``` + +The contents of `another.csv`. + +``` +col1,col3 +2,2 +3,2 +``` + +Filter by "col1" values in `another.csv`. + +``` +$ csvt include -i input.csv -c col1 -a another.csv -o output.csv +``` + +The contents of the created `output.csv`. + +``` +col1,col2 +2,B +3,C +``` + ## join Join CSV files. 
diff --git a/cmd/include.go b/cmd/include.go new file mode 100644 index 0000000..2e65cf3 --- /dev/null +++ b/cmd/include.go @@ -0,0 +1,129 @@ +package cmd + +import ( + "fmt" + "io" + + "github.com/onozaty/csvt/csv" + "github.com/onozaty/csvt/util" + "github.com/pkg/errors" + "github.com/spf13/cobra" +) + +func newIncludeCmd() *cobra.Command { + + includeCmd := &cobra.Command{ + Use: "include", + Short: "Filter rows by included in another CSV file", + RunE: func(cmd *cobra.Command, args []string) error { + + format, err := getFlagBaseCsvFormat(cmd.Flags()) + if err != nil { + return err + } + + inputPath, _ := cmd.Flags().GetString("input") + targetColumnName, _ := cmd.Flags().GetString("column") + anotherPath, _ := cmd.Flags().GetString("another") + anotherColumnName, _ := cmd.Flags().GetString("column-another") + outputPath, _ := cmd.Flags().GetString("output") + + // 引数の解析に成功した時点で、エラーが起きてもUsageは表示しない + cmd.SilenceUsage = true + + return runInclude( + format, + inputPath, + targetColumnName, + anotherPath, + outputPath, + IncludeOptions{ + anotherColumnName: anotherColumnName, + }) + }, + } + + includeCmd.Flags().StringP("input", "i", "", "Input CSV file path.") + includeCmd.MarkFlagRequired("input") + includeCmd.Flags().StringP("column", "c", "", "Name of the column to use for filtering.") + includeCmd.MarkFlagRequired("column") + includeCmd.Flags().StringP("another", "a", "", "Another CSV file path. Filter by included in this CSV file.") + includeCmd.MarkFlagRequired("another") + includeCmd.Flags().StringP("column-another", "", "", "(optional) Name of the column to use for filtering in the another CSV file. 
Specify if different from the input CSV file.") + includeCmd.Flags().StringP("output", "o", "", "Output CSV file path.") + includeCmd.MarkFlagRequired("output") + + return includeCmd +} + +type IncludeOptions struct { + anotherColumnName string +} + +func runInclude(format csv.Format, inputPath string, targetColumnName string, anotherPath string, outputPath string, options IncludeOptions) error { + + reader, writer, close, err := setupInputOutput(inputPath, outputPath, format) + if err != nil { + return err + } + defer close() + + anotherReader, anotherClose, err := setupInput(anotherPath, format) + if err != nil { + return err + } + defer anotherClose() + + err = include(reader, targetColumnName, anotherReader, writer, options) + if err != nil { + return err + } + + return writer.Flush() +} + +func include(reader csv.CsvReader, targetColumnName string, anotherReader csv.CsvReader, writer csv.CsvWriter, options IncludeOptions) error { + + inputTargetColumnName := targetColumnName + anotherTargetColumnName := targetColumnName + if options.anotherColumnName != "" { + anotherTargetColumnName = options.anotherColumnName + } + + inputColumnNames, err := reader.Read() + if err != nil { + return errors.Wrap(err, "failed to read the input CSV file") + } + inputTargetColumnIndex := util.IndexOf(inputColumnNames, inputTargetColumnName) + if inputTargetColumnIndex == -1 { + return fmt.Errorf("missing %s in the input CSV file", inputTargetColumnName) + } + + anotherItemSet, err := csv.LoadItemSet(anotherReader, anotherTargetColumnName) + if err != nil { + return errors.Wrap(err, "failed to read the another CSV file") + } + + writer.Write(inputColumnNames) + + for { + row, err := reader.Read() + if err == io.EOF { + break + } + if err != nil { + return errors.Wrap(err, "failed to read the input CSV file") + } + + // 比較対象のCSV内に存在した場合は出力 + if anotherItemSet.Contains(row[inputTargetColumnIndex]) { + + err = writer.Write(row) + if err != nil { + return err + } + } + } + + return 
nil +} diff --git a/cmd/include_test.go b/cmd/include_test.go new file mode 100644 index 0000000..10a1c11 --- /dev/null +++ b/cmd/include_test.go @@ -0,0 +1,432 @@ +package cmd + +import ( + "os" + "testing" +) + +func TestIncludeCmd(t *testing.T) { + + si := `col1,col2 +1,2 +2,3 +3,4 +4,5 +` + fi := createTempFile(t, si) + defer os.Remove(fi.Name()) + + sa := `col1,col2 +2,x +3,y +` + fa := createTempFile(t, sa) + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "include", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col1", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err != nil { + t.Fatal("failed test\n", err) + } + + result := readString(t, fo.Name()) + + expect := joinRows( + "col1,col2", + "2,3", + "3,4", + ) + + if result != expect { + t.Fatal("failed test\n", result) + } +} + +func TestIncludeCmd_columnAnother(t *testing.T) { + + si := `col1,col2 +1,2 +2,3 +3,4 +4,5 +` + fi := createTempFile(t, si) + defer os.Remove(fi.Name()) + + sa := `col1,col2 +2,3 +3,4 +` + fa := createTempFile(t, sa) + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "include", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col1", + "--column-another", "col2", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err != nil { + t.Fatal("failed test\n", err) + } + + result := readString(t, fo.Name()) + + expect := joinRows( + "col1,col2", + "3,4", + "4,5", + ) + + if result != expect { + t.Fatal("failed test\n", result) + } +} + +func TestIncludeCmd_duplicate(t *testing.T) { + + si := `col1,col2 +1,2 +2,3 +1,x +3,4 +` + fi := createTempFile(t, si) + defer os.Remove(fi.Name()) + + sa := `col1,col2 +1,x +1,y +` + fa := createTempFile(t, sa) + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + 
rootCmd.SetArgs([]string{ + "include", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col1", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err != nil { + t.Fatal("failed test\n", err) + } + + result := readString(t, fo.Name()) + + expect := joinRows( + "col1,col2", + "1,2", + "1,x", + ) + + if result != expect { + t.Fatal("failed test\n", result) + } +} + +func TestIncludeCmd_unmatch(t *testing.T) { + + si := `col1,col2 +1,2 +2,3 +3,4 +` + fi := createTempFile(t, si) + defer os.Remove(fi.Name()) + + sa := `col1 +4 +11 +` + fa := createTempFile(t, sa) + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "include", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col1", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err != nil { + t.Fatal("failed test\n", err) + } + + result := readString(t, fo.Name()) + + expect := joinRows( + "col1,col2", + ) + + if result != expect { + t.Fatal("failed test\n", result) + } +} + +func TestIncludeCmd_format(t *testing.T) { + + si := `col1 col2 +1 2 +2 3 +3 4 +` + fi := createTempFile(t, si) + defer os.Remove(fi.Name()) + + sa := `col1 col2 +2 +3 +` + fa := createTempFile(t, sa) + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "include", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col1", + "-o", fo.Name(), + "--delim", `\t`, + }) + + err := rootCmd.Execute() + if err != nil { + t.Fatal("failed test\n", err) + } + + result := readString(t, fo.Name()) + + expect := joinRows( + "col1 col2", + "2 3", + "3 4", + ) + + if result != expect { + t.Fatal("failed test\n", result) + } +} + +func TestIncludeCmd_invalidFormat(t *testing.T) { + + fi := createTempFile(t, "") + defer os.Remove(fi.Name()) + + fa := createTempFile(t, "") + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd 
:= newRootCmd() + rootCmd.SetArgs([]string{ + "include", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col1", + "-o", fo.Name(), + "--delim", "\t\t", + }) + + err := rootCmd.Execute() + if err == nil || err.Error() != "flag delim should be specified with a single character" { + t.Fatal("failed test\n", err) + } +} + +func TestIncludeCmd_inputColumnNotFound(t *testing.T) { + + si := `col1,col2 +1,2 +2,3 +` + fi := createTempFile(t, si) + defer os.Remove(fi.Name()) + + sa := `col1,col2 +1,x +1,y +` + fa := createTempFile(t, sa) + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "include", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col3", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err == nil || err.Error() != "missing col3 in the input CSV file" { + t.Fatal("failed test\n", err) + } +} + +func TestIncludeCmd_anthorColumnNotFound(t *testing.T) { + + si := `col1,col2 +1,2 +2,3 +` + fi := createTempFile(t, si) + defer os.Remove(fi.Name()) + + sa := `col1,col2 +1,x +1,y +` + fa := createTempFile(t, sa) + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "include", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col1", + "--column-another", "col3", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err == nil || err.Error() != "failed to read the another CSV file: col3 is not found" { + t.Fatal("failed test\n", err) + } +} + +func TestIncludeCmd_inputEmpty(t *testing.T) { + + fi := createTempFile(t, "") + defer os.Remove(fi.Name()) + + sa := `col1,col2 +1,x +1,y +` + fa := createTempFile(t, sa) + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "include", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col1", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + 
if err == nil || err.Error() != "failed to read the input CSV file: EOF" { + t.Fatal("failed test\n", err) + } +} + +func TestIncludeCmd_inputFileNotFound(t *testing.T) { + + fi := createTempFile(t, "") + defer os.Remove(fi.Name()) + + fa := createTempFile(t, "") + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "include", + "-i", fi.Name() + "____", // 存在しないファイル + "-a", fa.Name(), + "-c", "col1", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err == nil { + t.Fatal("failed test\n", err) + } + + pathErr := err.(*os.PathError) + if pathErr.Path != fi.Name()+"____" || pathErr.Op != "open" { + t.Fatal("failed test\n", err) + } +} + +func TestIncludeCmd_anotherFileNotFound(t *testing.T) { + + fi := createTempFile(t, "") + defer os.Remove(fi.Name()) + + fa := createTempFile(t, "") + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "include", + "-i", fi.Name(), + "-a", fa.Name() + "____", // 存在しないファイル + "-c", "col1", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err == nil { + t.Fatal("failed test\n", err) + } + + pathErr := err.(*os.PathError) + if pathErr.Path != fa.Name()+"____" || pathErr.Op != "open" { + t.Fatal("failed test\n", err) + } +} diff --git a/cmd/root.go b/cmd/root.go index 4090afc..62a822a 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -36,6 +36,7 @@ func newRootCmd() *cobra.Command { rootCmd.AddCommand(newTransformCmd()) rootCmd.AddCommand(newReplaceCmd()) rootCmd.AddCommand(newUniqueCmd()) + rootCmd.AddCommand(newIncludeCmd()) for _, c := range rootCmd.Commands() { // フラグ以外は受け付けないように diff --git a/csv/itemset.go b/csv/itemset.go new file mode 100644 index 0000000..ab82585 --- /dev/null +++ b/csv/itemset.go @@ -0,0 +1,61 @@ +package csv + +import ( + "fmt" + "io" + + "github.com/onozaty/csvt/util" +) + +type ItemSet struct { + items 
map[string]struct{} +} + +// 入れておく値は何でも良い +var itemValue = struct{}{} + +func (hashset *ItemSet) Add(item string) { + hashset.items[item] = itemValue +} + +func (hashset *ItemSet) Contains(item string) bool { + _, contains := hashset.items[item] + return contains +} + +func (hashset *ItemSet) Count() int { + return len(hashset.items) +} + +func NewItemSet() *ItemSet { + return &ItemSet{ + items: make(map[string]struct{}), + } +} + +func LoadItemSet(reader CsvReader, targetColumnName string) (*ItemSet, error) { + + columnNames, err := reader.Read() + if err != nil { + return nil, err + } + targetColumnIndex := util.IndexOf(columnNames, targetColumnName) + if targetColumnIndex == -1 { + return nil, fmt.Errorf("%s is not found", targetColumnName) + } + + itemSet := NewItemSet() + + for { + row, err := reader.Read() + if err == io.EOF { + break + } + if err != nil { + return nil, err + } + itemSet.Add(row[targetColumnIndex]) + } + + return itemSet, nil +} diff --git a/csv/itemset_test.go b/csv/itemset_test.go new file mode 100644 index 0000000..b2bf7b8 --- /dev/null +++ b/csv/itemset_test.go @@ -0,0 +1,97 @@ +package csv + +import ( + "strings" + "testing" +) + +func TestNewItemSet(t *testing.T) { + + itemset := NewItemSet() + if itemset.Count() != 0 { + t.Fatal("failed test\n", itemset.Count()) + } + if itemset.Contains("aa") { + t.Fatal("failed test\n") + } + + itemset.Add("aa") + if itemset.Count() != 1 { + t.Fatal("failed test\n", itemset.Count()) + } + if !itemset.Contains("aa") { + t.Fatal("failed test\n") + } + if itemset.Contains("a") { + t.Fatal("failed test\n") + } + + // 同じものを追加 + itemset.Add("aa") + if itemset.Count() != 1 { // 数は増えない + t.Fatal("failed test\n", itemset.Count()) + } + if !itemset.Contains("aa") { + t.Fatal("failed test\n") + } + if itemset.Contains("a") { + t.Fatal("failed test\n") + } + + itemset.Add("a") + if itemset.Count() != 2 { + t.Fatal("failed test\n", itemset.Count()) + } + if !itemset.Contains("aa") { + t.Fatal("failed test\n") + } 
+ if !itemset.Contains("a") { + t.Fatal("failed test\n") + } +} + +func TestLoadItemSet(t *testing.T) { + + s := `col1,col2 +1,2 +2,3 +3,3 +4,1 +` + + r := NewCsvReader(strings.NewReader(s), Format{}) + + itemset, err := LoadItemSet(r, "col2") + if err != nil { + t.Fatal("failed test\n", err) + } + + if itemset.Count() != 3 { + t.Fatal("failed test\n", itemset.Count()) + } + if !itemset.Contains("1") { + t.Fatal("failed test\n") + } + if !itemset.Contains("2") { + t.Fatal("failed test\n") + } + if !itemset.Contains("3") { + t.Fatal("failed test\n") + } + if itemset.Contains("4") { + t.Fatal("failed test\n") + } +} + +func TestLoadItemSet_columnNotFound(t *testing.T) { + + s := `col1,col2 +1,2 +` + r := NewCsvReader(strings.NewReader(s), Format{}) + + _, err := LoadItemSet(r, "col3") + if err == nil || err.Error() != "col3 is not found" { + t.Fatal("failed test\n", err) + } +} From 4f46e78bd1baa2b7f74398f418639291117d46a4 Mon Sep 17 00:00:00 2001 From: onozaty Date: Wed, 7 Jul 2021 00:49:03 +0900 Subject: [PATCH 3/4] =?UTF-8?q?[exclude]=20exclude=E3=82=B3=E3=83=9E?= =?UTF-8?q?=E3=83=B3=E3=83=89=E5=AE=9F=E8=A3=85=20#7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 58 ++++++ cmd/exclude.go | 129 ++++++++++++ cmd/exclude_test.go | 481 ++++++++++++++++++++++++++++++++++++++++++++ cmd/include_test.go | 52 ++++- cmd/root.go | 1 + 5 files changed, 720 insertions(+), 1 deletion(-) create mode 100644 cmd/exclude.go create mode 100644 cmd/exclude_test.go diff --git a/README.md b/README.md index 4e233d1..beac284 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ `csvt` consists of multiple subcommands. * [choose](#choose) Choose columns. +* [exclude](#exclude) Exclude rows by included in another CSV file. * [count](#count) Count the number of records. * [filter](#filter) Filter rows by condition. * [header](#header) Show header. 
@@ -88,6 +89,63 @@ Smith,30 Jun,22 ``` +## exclude + +Create a new CSV file by excluding the rows included in another CSV file. + +### Usage + +``` +csvt exclude -i INPUT -c COLUMN -a ANOTHER [--column-another COLUMN2] -o OUTPUT +``` + +``` +Usage: + csvt exclude [flags] + +Flags: + -i, --input string Input CSV file path. + -c, --column string Name of the column to use for exclude. + -a, --another string Another CSV file path. Exclude by included in this CSV file. + --column-another string (optional) Name of the column to use for exclude in the another CSV file. Specify if different from the input CSV file. + -o, --output string Output CSV file path. + -h, --help help for exclude +``` + +### Example + +The contents of `input.csv`. + +``` +col1,col2 +1,A +2,B +3,C +4,D +``` + +The contents of `another.csv`. + +``` +col1,col3 +2,2 +3,2 +``` + +Exclude by "col1" values in `another.csv`. + +``` +$ csvt exclude -i input.csv -c col1 -a another.csv -o output.csv +``` + +The contents of the created `output.csv`. + +``` +col1,col2 +1,A +4,D +``` + ## count Count the number of records in CSV file. 
diff --git a/cmd/exclude.go b/cmd/exclude.go new file mode 100644 index 0000000..59ea28e --- /dev/null +++ b/cmd/exclude.go @@ -0,0 +1,129 @@ +package cmd + +import ( + "fmt" + "io" + + "github.com/onozaty/csvt/csv" + "github.com/onozaty/csvt/util" + "github.com/pkg/errors" + "github.com/spf13/cobra" +) + +func newExcludeCmd() *cobra.Command { + + excludeCmd := &cobra.Command{ + Use: "exclude", + Short: "Exclude rows by included in another CSV file", + RunE: func(cmd *cobra.Command, args []string) error { + + format, err := getFlagBaseCsvFormat(cmd.Flags()) + if err != nil { + return err + } + + inputPath, _ := cmd.Flags().GetString("input") + targetColumnName, _ := cmd.Flags().GetString("column") + anotherPath, _ := cmd.Flags().GetString("another") + anotherColumnName, _ := cmd.Flags().GetString("column-another") + outputPath, _ := cmd.Flags().GetString("output") + + // 引数の解析に成功した時点で、エラーが起きてもUsageは表示しない + cmd.SilenceUsage = true + + return runExclude( + format, + inputPath, + targetColumnName, + anotherPath, + outputPath, + ExcludeOptions{ + anotherColumnName: anotherColumnName, + }) + }, + } + + excludeCmd.Flags().StringP("input", "i", "", "Input CSV file path.") + excludeCmd.MarkFlagRequired("input") + excludeCmd.Flags().StringP("column", "c", "", "Name of the column to use for exclude.") + excludeCmd.MarkFlagRequired("column") + excludeCmd.Flags().StringP("another", "a", "", "Another CSV file path. Exclude by included in this CSV file.") + excludeCmd.MarkFlagRequired("another") + excludeCmd.Flags().StringP("column-another", "", "", "(optional) Name of the column to use for exclude in the another CSV file. 
Specify if different from the input CSV file.") + excludeCmd.Flags().StringP("output", "o", "", "Output CSV file path.") + excludeCmd.MarkFlagRequired("output") + + return excludeCmd +} + +type ExcludeOptions struct { + anotherColumnName string +} + +func runExclude(format csv.Format, inputPath string, targetColumnName string, anotherPath string, outputPath string, options ExcludeOptions) error { + + reader, writer, close, err := setupInputOutput(inputPath, outputPath, format) + if err != nil { + return err + } + defer close() + + anotherReader, anotherClose, err := setupInput(anotherPath, format) + if err != nil { + return err + } + defer anotherClose() + + err = exclude(reader, targetColumnName, anotherReader, writer, options) + if err != nil { + return err + } + + return writer.Flush() +} + +func exclude(reader csv.CsvReader, targetColumnName string, anotherReader csv.CsvReader, writer csv.CsvWriter, options ExcludeOptions) error { + + inputTargetColumnName := targetColumnName + anotherTargetColumnName := targetColumnName + if options.anotherColumnName != "" { + anotherTargetColumnName = options.anotherColumnName + } + + inputColumnNames, err := reader.Read() + if err != nil { + return errors.Wrap(err, "failed to read the input CSV file") + } + inputTargetColumnIndex := util.IndexOf(inputColumnNames, inputTargetColumnName) + if inputTargetColumnIndex == -1 { + return fmt.Errorf("missing %s in the input CSV file", inputTargetColumnName) + } + + anotherItemSet, err := csv.LoadItemSet(anotherReader, anotherTargetColumnName) + if err != nil { + return errors.Wrap(err, "failed to read the another CSV file") + } + + writer.Write(inputColumnNames) + + for { + row, err := reader.Read() + if err == io.EOF { + break + } + if err != nil { + return errors.Wrap(err, "failed to read the input CSV file") + } + + // 比較対象のCSV内に存在ない場合は出力 + if !anotherItemSet.Contains(row[inputTargetColumnIndex]) { + + err = writer.Write(row) + if err != nil { + return err + } + } + } + + return 
nil +} diff --git a/cmd/exclude_test.go b/cmd/exclude_test.go new file mode 100644 index 0000000..2d2f6ec --- /dev/null +++ b/cmd/exclude_test.go @@ -0,0 +1,481 @@ +package cmd + +import ( + "os" + "testing" +) + +func TestExcludeCmd(t *testing.T) { + + si := `col1,col2 +1,2 +2,3 +3,4 +4,5 +` + fi := createTempFile(t, si) + defer os.Remove(fi.Name()) + + sa := `col1,col2 +2,x +3,y +` + fa := createTempFile(t, sa) + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "exclude", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col1", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err != nil { + t.Fatal("failed test\n", err) + } + + result := readString(t, fo.Name()) + + expect := joinRows( + "col1,col2", + "1,2", + "4,5", + ) + + if result != expect { + t.Fatal("failed test\n", result) + } +} + +func TestExcludeCmd_columnAnother(t *testing.T) { + + si := `col1,col2 +1,2 +2,3 +3,4 +4,5 +` + fi := createTempFile(t, si) + defer os.Remove(fi.Name()) + + sa := `col1,col2 +2,3 +3,4 +` + fa := createTempFile(t, sa) + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "exclude", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col1", + "--column-another", "col2", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err != nil { + t.Fatal("failed test\n", err) + } + + result := readString(t, fo.Name()) + + expect := joinRows( + "col1,col2", + "1,2", + "2,3", + ) + + if result != expect { + t.Fatal("failed test\n", result) + } +} + +func TestExcludeCmd_duplicate(t *testing.T) { + + si := `col1,col2 +1,2 +2,3 +1,x +3,4 +` + fi := createTempFile(t, si) + defer os.Remove(fi.Name()) + + sa := `col1,col2 +1,x +1,y +` + fa := createTempFile(t, sa) + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + 
rootCmd.SetArgs([]string{ + "exclude", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col1", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err != nil { + t.Fatal("failed test\n", err) + } + + result := readString(t, fo.Name()) + + expect := joinRows( + "col1,col2", + "2,3", + "3,4", + ) + + if result != expect { + t.Fatal("failed test\n", result) + } +} + +func TestExcludeCmd_match_none(t *testing.T) { + + si := `col1,col2 +1,2 +2,3 +3,4 +` + fi := createTempFile(t, si) + defer os.Remove(fi.Name()) + + sa := `col1 +4 +11 +` + fa := createTempFile(t, sa) + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "exclude", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col1", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err != nil { + t.Fatal("failed test\n", err) + } + + result := readString(t, fo.Name()) + + expect := joinRows( + "col1,col2", + "1,2", + "2,3", + "3,4", + ) + + if result != expect { + t.Fatal("failed test\n", result) + } +} + +func TestExcludeCmd_match_all(t *testing.T) { + + si := `col1,col2 +1,2 +2,3 +3,4 +` + fi := createTempFile(t, si) + defer os.Remove(fi.Name()) + + sa := `col1 +4 +3 +2 +1 +` + fa := createTempFile(t, sa) + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "exclude", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col1", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err != nil { + t.Fatal("failed test\n", err) + } + + result := readString(t, fo.Name()) + + expect := joinRows( + "col1,col2", + ) + + if result != expect { + t.Fatal("failed test\n", result) + } +} + +func TestExcludeCmd_format(t *testing.T) { + + si := `col1 col2 +1 2 +2 3 +3 4 +` + fi := createTempFile(t, si) + defer os.Remove(fi.Name()) + + sa := `col1 col2 +2 +3 +` + fa := createTempFile(t, sa) + defer os.Remove(fa.Name()) + + fo := 
createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "exclude", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col1", + "-o", fo.Name(), + "--delim", `\t`, + }) + + err := rootCmd.Execute() + if err != nil { + t.Fatal("failed test\n", err) + } + + result := readString(t, fo.Name()) + + expect := joinRows( + "col1 col2", + "1 2", + ) + + if result != expect { + t.Fatal("failed test\n", result) + } +} + +func TestExcludeCmd_invalidFormat(t *testing.T) { + + fi := createTempFile(t, "") + defer os.Remove(fi.Name()) + + fa := createTempFile(t, "") + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "exclude", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col1", + "-o", fo.Name(), + "--delim", "\t\t", + }) + + err := rootCmd.Execute() + if err == nil || err.Error() != "flag delim should be specified with a single character" { + t.Fatal("failed test\n", err) + } +} + +func TestExcludeCmd_inputColumnNotFound(t *testing.T) { + + si := `col1,col2 +1,2 +2,3 +` + fi := createTempFile(t, si) + defer os.Remove(fi.Name()) + + sa := `col1,col2 +1,x +1,y +` + fa := createTempFile(t, sa) + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "exclude", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col3", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err == nil || err.Error() != "missing col3 in the input CSV file" { + t.Fatal("failed test\n", err) + } +} + +func TestExcludeCmd_anthorColumnNotFound(t *testing.T) { + + si := `col1,col2 +1,2 +2,3 +` + fi := createTempFile(t, si) + defer os.Remove(fi.Name()) + + sa := `col1,col2 +1,x +1,y +` + fa := createTempFile(t, sa) + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "exclude", + 
"-i", fi.Name(), + "-a", fa.Name(), + "-c", "col1", + "--column-another", "col3", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err == nil || err.Error() != "failed to read the another CSV file: col3 is not found" { + t.Fatal("failed test\n", err) + } +} + +func TestExcludeCmd_inputEmpty(t *testing.T) { + + fi := createTempFile(t, "") + defer os.Remove(fi.Name()) + + sa := `col1,col2 +1,x +1,y +` + fa := createTempFile(t, sa) + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "exclude", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col1", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err == nil || err.Error() != "failed to read the input CSV file: EOF" { + t.Fatal("failed test\n", err) + } +} + +func TestExcludeCmd_inputFileNotFound(t *testing.T) { + + fi := createTempFile(t, "") + defer os.Remove(fi.Name()) + + fa := createTempFile(t, "") + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "exclude", + "-i", fi.Name() + "____", // 存在しないファイル + "-a", fa.Name(), + "-c", "col1", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err == nil { + t.Fatal("failed test\n", err) + } + + pathErr := err.(*os.PathError) + if pathErr.Path != fi.Name()+"____" || pathErr.Op != "open" { + t.Fatal("failed test\n", err) + } +} + +func TestExcludeCmd_anotherFileNotFound(t *testing.T) { + + fi := createTempFile(t, "") + defer os.Remove(fi.Name()) + + fa := createTempFile(t, "") + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "exclude", + "-i", fi.Name(), + "-a", fa.Name() + "____", // 存在しないファイル + "-c", "col1", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err == nil { + t.Fatal("failed test\n", err) + } + + pathErr := err.(*os.PathError) + if 
pathErr.Path != fa.Name()+"____" || pathErr.Op != "open" { + t.Fatal("failed test\n", err) + } +} diff --git a/cmd/include_test.go b/cmd/include_test.go index 10a1c11..19c00cb 100644 --- a/cmd/include_test.go +++ b/cmd/include_test.go @@ -150,7 +150,7 @@ func TestIncludeCmd_duplicate(t *testing.T) { } } -func TestIncludeCmd_unmatch(t *testing.T) { +func TestIncludeCmd_match_none(t *testing.T) { si := `col1,col2 1,2 @@ -195,6 +195,56 @@ func TestIncludeCmd_unmatch(t *testing.T) { } } +func TestIncludeCmd_match_all(t *testing.T) { + + si := `col1,col2 +1,2 +2,3 +3,4 +` + fi := createTempFile(t, si) + defer os.Remove(fi.Name()) + + sa := `col1 +4 +3 +2 +1 +` + fa := createTempFile(t, sa) + defer os.Remove(fa.Name()) + + fo := createTempFile(t, "") + defer os.Remove(fo.Name()) + + rootCmd := newRootCmd() + rootCmd.SetArgs([]string{ + "include", + "-i", fi.Name(), + "-a", fa.Name(), + "-c", "col1", + "-o", fo.Name(), + }) + + err := rootCmd.Execute() + if err != nil { + t.Fatal("failed test\n", err) + } + + result := readString(t, fo.Name()) + + expect := joinRows( + "col1,col2", + "1,2", + "2,3", + "3,4", + ) + + if result != expect { + t.Fatal("failed test\n", result) + } +} + func TestIncludeCmd_format(t *testing.T) { si := `col1 col2 diff --git a/cmd/root.go b/cmd/root.go index 62a822a..a48af5c 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -37,6 +37,7 @@ func newRootCmd() *cobra.Command { rootCmd.AddCommand(newReplaceCmd()) rootCmd.AddCommand(newUniqueCmd()) rootCmd.AddCommand(newIncludeCmd()) + rootCmd.AddCommand(newExcludeCmd()) for _, c := range rootCmd.Commands() { // フラグ以外は受け付けないように From 53bc0e1a6118017ee4ea5ee4e2971b81416d79a4 Mon Sep 17 00:00:00 2001 From: onozaty Date: Wed, 7 Jul 2021 00:52:27 +0900 Subject: [PATCH 4/4] =?UTF-8?q?[unique]=20=E9=87=8D=E8=A4=87=E3=83=81?= =?UTF-8?q?=E3=82=A7=E3=83=83=E3=82=AF=E3=81=ABItemSet=E3=82=92=E5=88=A9?= =?UTF-8?q?=E7=94=A8=E3=81=99=E3=82=8B=E3=82=88=E3=81=86=E3=81=AB=E5=A4=89?= =?UTF-8?q?=E6=9B=B4?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmd/unique.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cmd/unique.go b/cmd/unique.go index a7adf09..ba1c2f1 100644 --- a/cmd/unique.go +++ b/cmd/unique.go @@ -99,8 +99,8 @@ func unique(reader csv.CsvReader, targetColumnNames []string, writer csv.CsvWrit return err } - // 重複チェック用のmap(valueは利用しないので一律0を入れる) - keyMap := make(map[string]int) + // 重複チェック用 + keySet := csv.NewItemSet() // ヘッダ以外 for { @@ -114,15 +114,14 @@ func unique(reader csv.CsvReader, targetColumnNames []string, writer csv.CsvWrit key := makeKey(row) - _, has := keyMap[key] - if !has { + if !keySet.Contains(key) { // 重複していない行なので書き込み err = writer.Write(row) if err != nil { return err } - keyMap[key] = 0 + keySet.Add(key) } }