From d8ff9350d4613b1d07b52da0c8e8f2fcf7fde91d Mon Sep 17 00:00:00 2001 From: schuemie Date: Wed, 9 Jan 2019 09:42:39 +0100 Subject: [PATCH] Code cleanup --- .../whiteRabbit/scan/SourceDataScan.java | 1019 ++++++++--------- 1 file changed, 481 insertions(+), 538 deletions(-) diff --git a/src/org/ohdsi/whiteRabbit/scan/SourceDataScan.java b/src/org/ohdsi/whiteRabbit/scan/SourceDataScan.java index cd08ae0a..f5dd4f1f 100644 --- a/src/org/ohdsi/whiteRabbit/scan/SourceDataScan.java +++ b/src/org/ohdsi/whiteRabbit/scan/SourceDataScan.java @@ -1,538 +1,481 @@ -/******************************************************************************* - * Copyright 2017 Observational Health Data Sciences and Informatics - * - * This file is part of WhiteRabbit - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ -package org.ohdsi.whiteRabbit.scan; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - -import org.apache.poi.ss.usermodel.Cell; -import org.apache.poi.ss.usermodel.Row; -import org.apache.poi.ss.usermodel.Sheet; -import org.apache.poi.xssf.streaming.SXSSFWorkbook; -import org.ohdsi.databases.DbType; -import org.ohdsi.databases.RichConnection; -import org.ohdsi.databases.RichConnection.QueryResult; -import org.ohdsi.utilities.StringUtilities; -import org.ohdsi.utilities.collections.CountingSet; -import org.ohdsi.utilities.collections.CountingSet.Count; -import org.ohdsi.utilities.collections.Pair; -import org.ohdsi.utilities.files.ReadTextFile; -import org.ohdsi.whiteRabbit.DbSettings; - -public class SourceDataScan { - - public static int MAX_VALUES_IN_MEMORY = 100000; - // public static int MAX_VALUES_TO_REPORT = 25000; - public static int MIN_CELL_COUNT_FOR_CSV = 1000000; - public static int N_FOR_FREE_TEXT_CHECK = 1000; - public static int MIN_AVERAGE_LENGTH_FOR_FREE_TEXT = 100; - - private char delimiter = ','; - private int sampleSize; - private boolean scanValues; - private int minCellCount; - private int maxValues; - private DbType dbType; - private String database; - - public static void main(String[] args) { - DbSettings dbSettings = new DbSettings(); - dbSettings.dataType = DbSettings.DATABASE; - dbSettings.dbType = DbType.POSTGRESQL; - dbSettings.server = "127.0.0.1/ohdsi"; - dbSettings.database = "ars"; - dbSettings.tables.add("drugs"); - dbSettings.user = "postgres"; - dbSettings.password = ""; - SourceDataScan scan = new SourceDataScan(); - scan.process(dbSettings, 100000, true, 5, 1000, "s:/temp/ScanReport.xlsx"); - - // DbSettings dbSettings = new DbSettings(); - // dbSettings.dataType = DbSettings.DATABASE; - // dbSettings.dbType = DbType.ORACLE; - // dbSettings.server = "127.0.0.1/xe"; - // dbSettings.database = "test"; - // dbSettings.tables.add("test_table"); - // dbSettings.user = "system"; - // dbSettings.password = "F1r3starter"; - // SourceDataScan scan = new SourceDataScan(); - // scan.process(dbSettings, 1000000, "s:/data/ScanReport.xlsx"); - - // DbSettings dbSettings = new DbSettings(); - // dbSettings.dataType = DbSettings.DATABASE; - // dbSettings.dbType = DbType.MSSQL; - // dbSettings.server = "RNDUSRDHIT04"; - // dbSettings.database = "[HCUP-NIS]"; - // dbSettings.tables.add("hospital"); - // dbSettings.tables.add("severity"); - // dbSettings.tables.add("dx_pr_grps"); - // dbSettings.tables.add("core"); - // SourceDataScan scan = new SourceDataScan(); - // scan.process(dbSettings, 1000000, true, 25, "s:/data/ScanReport.xlsx"); - - // DbSettings dbSettings = new DbSettings(); - // dbSettings.dataType = DbSettings.DATABASE; - // dbSettings.dbType = DbType.MYSQL; - // dbSettings.server = "127.0.0.1"; - // dbSettings.database = "CDM_v4"; - // dbSettings.user = "root"; - // dbSettings.password = "F1r3starter"; - // dbSettings.tables.add("person"); - // dbSettings.tables.add("provider"); - // SourceDataScan scan = new SourceDataScan(); - // scan.process(dbSettings, 100000, true, 25, "c:/temp/ScanReport.xlsx"); - // - // DbSettings dbSettings = new DbSettings(); - // dbSettings.dataType = DbSettings.CSVFILES; - // dbSettings.delimiter = ','; - // dbSettings.tables.add("S:/Data/ARS/Simulation/DDRUG.csv"); - // dbSettings.tables.add("S:/Data/ARS/Simulation/HOSP.csv"); - // SourceDataScan scan = new SourceDataScan(); - // scan.process(dbSettings, 100000, false, 25, "c:/temp/ScanReport.xlsx"); - } - - public void process(DbSettings dbSettings, int sampleSize, boolean scanValues, int minCellCount, int maxValues, String filename) { - this.sampleSize = sampleSize; - this.scanValues = scanValues; - this.minCellCount = minCellCount; - this.maxValues = maxValues; - Map> tableToFieldInfos; - if (dbSettings.dataType == DbSettings.CSVFILES) { - if (!scanValues) - minCellCount = Math.max(minCellCount, MIN_CELL_COUNT_FOR_CSV); - tableToFieldInfos = processCsvFiles(dbSettings); - } else - tableToFieldInfos = processDatabase(dbSettings); - generateReport(tableToFieldInfos, filename); - } - - private Map> processDatabase(DbSettings dbSettings) { - Map> tableToFieldInfos = new HashMap>(); - RichConnection connection = new RichConnection(dbSettings.server, dbSettings.domain, dbSettings.user, dbSettings.password, dbSettings.dbType); - connection.setVerbose(false); - connection.use(dbSettings.database); - - dbType = dbSettings.dbType; - database = dbSettings.database; - - for (String table : dbSettings.tables) { - List fieldInfos = processDatabaseTable(table, connection); - tableToFieldInfos.put(table, fieldInfos); - } - - connection.close(); - return tableToFieldInfos; - } - - private Map> processCsvFiles(DbSettings dbSettings) { - delimiter = dbSettings.delimiter; - Map> tableToFieldInfos = new HashMap>(); - for (String table : dbSettings.tables) { - List fieldInfos = processCsvFile(table); - String tableName = new File(table).getName(); - if (!tableToFieldInfos.containsKey(tableName)) { - tableToFieldInfos.put(tableName, fieldInfos); - } else { - tableToFieldInfos.put(table, fieldInfos); - } - - } - return tableToFieldInfos; - } - - private void generateReport(Map> tableToFieldInfos, String filename) { - System.out.println("Generating scan report"); - removeEmptyTables(tableToFieldInfos); - List tables = new ArrayList(tableToFieldInfos.keySet()); - Collections.sort(tables); - - SXSSFWorkbook workbook = new SXSSFWorkbook(100); // keep 100 rows in memory, exceeding rows will be flushed to disk - - // Create overview sheet - Sheet sheet = workbook.createSheet("Overview"); - if (!scanValues) { - addRow(sheet, "Table", "Field", "Type", "N rows"); - for (String table : tables) { - for (FieldInfo fieldInfo : tableToFieldInfos.get(table)) - addRow(sheet, table, fieldInfo.name, fieldInfo.getTypeDescription(), Long.valueOf(fieldInfo.rowCount)); - addRow(sheet, ""); - } - } else { - addRow(sheet, "Table", "Field", "Type", "Max length", "N rows", "N rows checked", "Fraction empty"); - for (String table : tables) { - for (FieldInfo fieldInfo : tableToFieldInfos.get(table)) - addRow(sheet, table, fieldInfo.name, fieldInfo.getTypeDescription(), Integer.valueOf(fieldInfo.maxLength), Long.valueOf(fieldInfo.rowCount), - Long.valueOf(fieldInfo.nProcessed), fieldInfo.getFractionEmpty()); - addRow(sheet, ""); - } - - // Create per table sheets - for (String table : tables) { - sheet = workbook.createSheet(table); - List fieldInfos = tableToFieldInfos.get(table); - List>> valueCounts = new ArrayList>>(); - Object[] header = new Object[fieldInfos.size() * 2]; - int maxCount = 0; - for (int i = 0; i < fieldInfos.size(); i++) { - FieldInfo fieldInfo = fieldInfos.get(i); - header[i * 2] = fieldInfo.name; - if (fieldInfo.isFreeText) - header[(i * 2) + 1] = "Word count"; - else - header[(i * 2) + 1] = "Frequency"; - List> counts = fieldInfo.getSortedValuesWithoutSmallValues(); - valueCounts.add(counts); - if (counts.size() > maxCount) - maxCount = counts.size(); - } - addRow(sheet, header); - for (int i = 0; i < maxCount; i++) { - Object[] row = new Object[fieldInfos.size() * 2]; - for (int j = 0; j < fieldInfos.size(); j++) { - List> counts = valueCounts.get(j); - if (counts.size() > i) { - row[j * 2] = counts.get(i).getItem1(); - row[(j * 2) + 1] = counts.get(i).getItem2() == -1 ? "" : counts.get(i).getItem2(); - } else { - row[j * 2] = ""; - row[(j * 2) + 1] = ""; - } - } - addRow(sheet, row); - } - // Save some memory by derefencing tables already included in the report: - tableToFieldInfos.remove(table); - } - } - - try { - FileOutputStream out = new FileOutputStream(new File(filename)); - workbook.write(out); - out.close(); - StringUtilities.outputWithTime("Scan report generated: " + filename); - } catch (IOException e) { - throw new RuntimeException(e.getMessage()); - } - } - - private void removeEmptyTables(Map> tableToFieldInfos) { - Iterator>> iterator = tableToFieldInfos.entrySet().iterator(); - while (iterator.hasNext()) { - if (iterator.next().getValue().size() == 0) - iterator.remove(); - } - } - - private List processDatabaseTable(String table, RichConnection connection) { - StringUtilities.outputWithTime("Scanning table " + table); - - long rowCount = connection.getTableSize(table); - List fieldInfos = fetchTableStructure(connection, table); - if (scanValues) { - int actualCount = 0; - QueryResult queryResult = null; - try { - queryResult = fetchRowsFromTable(connection, table, rowCount); - for (org.ohdsi.utilities.files.Row row : queryResult) { - for (int i = 0; i < fieldInfos.size(); i++) - fieldInfos.get(i).processValue(row.get(fieldInfos.get(i).name)); - actualCount++; - if (sampleSize != -1 && actualCount >= sampleSize) { - System.out.println("Stopped after " + actualCount + " rows"); - break; - } - } - for (FieldInfo fieldInfo : fieldInfos) - fieldInfo.trim(); - } catch (Exception e) { - System.out.println("Error: " + e.getMessage()); - } finally { - if (queryResult != null) { - queryResult.close(); - } - } - } - - return fieldInfos; - } - - private QueryResult fetchRowsFromTable(RichConnection connection, String table, long rowCount) { - String query = null; - - if (sampleSize == -1) { - if (dbType == DbType.MSSQL || dbType == DbType.PDW || dbType == DbType.MSACCESS) - query = "SELECT * FROM [" + table + "]"; - else - query = "SELECT * FROM " + table; - } else { - if (dbType == DbType.MSSQL) - query = "SELECT * FROM [" + table + "] TABLESAMPLE (" + sampleSize + " ROWS)"; - else if (dbType == DbType.MYSQL) - query = "SELECT * FROM " + table + " ORDER BY RAND() LIMIT " + sampleSize; - else if (dbType == DbType.PDW) - query = "SELECT TOP " + sampleSize + " * FROM [" + table + "] ORDER BY RAND()"; - else if (dbType == DbType.ORACLE) { - if (sampleSize < rowCount) { - double percentage = 100 * sampleSize / (double) rowCount; - if (percentage < 100) - query = "SELECT * FROM " + table + " SAMPLE(" + percentage + ")"; - } else { - query = "SELECT * FROM " + table; - } - } else if (dbType == DbType.POSTGRESQL || dbType == DbType.REDSHIFT) - query = "SELECT * FROM " + table + " ORDER BY RANDOM() LIMIT " + sampleSize; - else if (dbType == DbType.MSACCESS) - query = "SELECT " + "TOP " + sampleSize + " * FROM [" + table + "]"; - } - // System.out.println("SQL: " + query); - return connection.query(query); - - } - - private List fetchTableStructure(RichConnection connection, String table) { - List fieldInfos = new ArrayList(); - - if (dbType == DbType.MSACCESS) { - ResultSet rs = connection.getMsAccessFieldNames(table); - try { - while (rs.next()) { - FieldInfo fieldInfo = new FieldInfo(rs.getString("COLUMN_NAME")); - fieldInfo.type = rs.getString("TYPE_NAME"); - fieldInfo.rowCount = connection.getTableSize(table); - fieldInfos.add(fieldInfo); - } - } catch (SQLException e) { - throw new RuntimeException(e.getMessage()); - } - } else { - String query = null; - if (dbType == DbType.ORACLE) - query = "SELECT COLUMN_NAME,DATA_TYPE FROM ALL_TAB_COLUMNS WHERE table_name = '" + table + "' AND owner = '" + database.toUpperCase() + "'"; - else if (dbType == DbType.MSSQL || dbType == DbType.PDW) { - String trimmedDatabase = database; - if (database.startsWith("[") && database.endsWith("]")) - trimmedDatabase = database.substring(1, database.length() - 1); - query = "SELECT COLUMN_NAME,DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_CATALOG='" + trimmedDatabase + "' AND TABLE_NAME='" + table - + "';"; - } else if (dbType == DbType.MYSQL) - query = "SELECT COLUMN_NAME,DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '" + database + "' AND TABLE_NAME = '" + table - + "';"; - else if (dbType == DbType.POSTGRESQL || dbType == DbType.REDSHIFT) - query = "SELECT COLUMN_NAME,DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '" + database.toLowerCase() + "' AND TABLE_NAME = '" - + table.toLowerCase() + "' ORDER BY ordinal_position;"; - else if (dbType == DbType.TERADATA) { - query = "SELECT ColumnName, ColumnType FROM dbc.columns WHERE DatabaseName= '" + database.toLowerCase() + "' AND TableName = '" - + table.toLowerCase() + "';"; - } - - for (org.ohdsi.utilities.files.Row row : connection.query(query)) { - row.upperCaseFieldNames(); - FieldInfo fieldInfo; - if (dbType == DbType.TERADATA) { - fieldInfo = new FieldInfo(row.get("COLUMNNAME")); - } else { - fieldInfo = new FieldInfo(row.get("COLUMN_NAME")); - } - if (dbType == DbType.TERADATA) { - fieldInfo.type = row.get("COLUMNTYPE"); - } else { - fieldInfo.type = row.get("DATA_TYPE"); - } - fieldInfo.rowCount = connection.getTableSize(table); - ; - fieldInfos.add(fieldInfo); - } - } - return fieldInfos; - } - - private List processCsvFile(String filename) { - StringUtilities.outputWithTime("Scanning table " + filename); - List fieldInfos = new ArrayList(); - int lineNr = 0; - for (String line : new ReadTextFile(filename)) { - lineNr++; - List row = StringUtilities.safeSplit(line, delimiter); - for (int i = 0; i < row.size(); i++) { - String column = row.get(i); - if (column.startsWith("\"") && column.endsWith("\"") && column.length() > 1) - column = column.substring(1, column.length() - 1); - column = column.replace("\\\"", "\""); - row.set(i, column); - } - if (lineNr == 1) { - for (String cell : row) - fieldInfos.add(new FieldInfo(cell)); - } else { - if (row.size() == fieldInfos.size()) { // Else there appears to be a formatting error, so skip - for (int i = 0; i < row.size(); i++) - fieldInfos.get(i).processValue(row.get(i)); - } - } - if (sampleSize != -1 && lineNr == sampleSize) - break; - } - for (FieldInfo fieldInfo : fieldInfos) - fieldInfo.trim(); - - return fieldInfos; - } - - private class FieldInfo { - public String type; - public String name; - public CountingSet valueCounts = new CountingSet(); - public long sumLength = 0; - public int maxLength = 0; - public long nProcessed = 0; - public long emptyCount = 0; - public long rowCount = -1; - public boolean isInteger = true; - public boolean isReal = true; - public boolean isDate = true; - public boolean isFreeText = false; - public boolean tooManyValues = false; - - public FieldInfo(String name) { - this.name = name; - } - - public void trim() { - if (valueCounts.size() > maxValues) - valueCounts.keepTopN(maxValues); - } - - public Double getFractionEmpty() { - if (nProcessed == 0) - return 0d; - else - return emptyCount / (double) nProcessed; - } - - public String getTypeDescription() { - if (type != null) - return type; - else if (nProcessed == emptyCount) - return "empty"; - else if (isFreeText) - return "text"; - else if (isDate) - return "date"; - else if (isInteger) - return "int"; - else if (isReal) - return "real"; - else - return "varchar"; - } - - public void processValue(String value) { - String trimValue = value.trim(); - nProcessed++; - sumLength += value.length(); - if (value.length() > maxLength) - maxLength = value.length(); - - if (trimValue.length() == 0) - emptyCount++; - - if (!isFreeText) { - valueCounts.add(value); - - if (trimValue.length() != 0) { - if (isReal && !StringUtilities.isNumber(trimValue)) - isReal = false; - if (isInteger && !StringUtilities.isLong(trimValue)) - isInteger = false; - if (isDate && !StringUtilities.isDate(trimValue)) - isDate = false; - } - if (nProcessed == N_FOR_FREE_TEXT_CHECK) { - if (!isInteger && !isReal && !isDate) { - double averageLength = sumLength / (double) (nProcessed - emptyCount); - if (averageLength >= MIN_AVERAGE_LENGTH_FOR_FREE_TEXT) { - isFreeText = true; - CountingSet wordCounts = new CountingSet(); - for (Map.Entry entry : valueCounts.key2count.entrySet()) - for (String word : StringUtilities.mapToWords(entry.getKey().toLowerCase())) - wordCounts.add(word, entry.getValue().count); - valueCounts = wordCounts; - } - } - } - } else { - for (String word : StringUtilities.mapToWords(trimValue.toLowerCase())) - valueCounts.add(word); - } - - if (!tooManyValues && valueCounts.size() > MAX_VALUES_IN_MEMORY) { - tooManyValues = true; - valueCounts.keepTopN(maxValues); - } - } - - public List> getSortedValuesWithoutSmallValues() { - boolean truncated = false; - List> result = new ArrayList>(); - - for (Map.Entry entry : valueCounts.key2count.entrySet()) { - if (entry.getValue().count < minCellCount) - truncated = true; - else { - result.add(new Pair(entry.getKey(), entry.getValue().count)); - if (result.size() > maxValues) { - truncated = true; - break; - } - } - } - - Collections.sort(result, new Comparator>() { - public int compare(Pair o1, Pair o2) { - return o2.getItem2().compareTo(o1.getItem2()); - } - }); - if (truncated) - result.add(new Pair("List truncated...", -1)); - return result; - } - } - - private void addRow(Sheet sheet, Object... values) { - Row row = sheet.createRow(sheet.getPhysicalNumberOfRows()); - for (Object value : values) { - Cell cell = row.createCell(row.getPhysicalNumberOfCells()); - - if (value instanceof Integer || value instanceof Long || value instanceof Double) - cell.setCellValue(Double.parseDouble(value.toString())); - else - cell.setCellValue(value.toString()); - - } - } -} +/******************************************************************************* + * Copyright 2017 Observational Health Data Sciences and Informatics + * + * This file is part of WhiteRabbit + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ +package org.ohdsi.whiteRabbit.scan; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.apache.poi.ss.usermodel.Cell; +import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.xssf.streaming.SXSSFWorkbook; +import org.ohdsi.databases.DbType; +import org.ohdsi.databases.RichConnection; +import org.ohdsi.databases.RichConnection.QueryResult; +import org.ohdsi.utilities.StringUtilities; +import org.ohdsi.utilities.collections.CountingSet; +import org.ohdsi.utilities.collections.CountingSet.Count; +import org.ohdsi.utilities.collections.Pair; +import org.ohdsi.utilities.files.ReadTextFile; +import org.ohdsi.whiteRabbit.DbSettings; + +public class SourceDataScan { + + public static int MAX_VALUES_IN_MEMORY = 100000; + public static int MIN_CELL_COUNT_FOR_CSV = 1000000; + public static int N_FOR_FREE_TEXT_CHECK = 1000; + public static int MIN_AVERAGE_LENGTH_FOR_FREE_TEXT = 100; + + private char delimiter = ','; + private int sampleSize; + private boolean scanValues; + private int minCellCount; + private int maxValues; + private DbType dbType; + private String database; + + public void process(DbSettings dbSettings, int sampleSize, boolean scanValues, int minCellCount, int maxValues, String filename) { + this.sampleSize = sampleSize; + this.scanValues = scanValues; + this.minCellCount = minCellCount; + this.maxValues = maxValues; + Map> tableToFieldInfos; + if (dbSettings.dataType == DbSettings.CSVFILES) { + if (!scanValues) + minCellCount = Math.max(minCellCount, MIN_CELL_COUNT_FOR_CSV); + tableToFieldInfos = processCsvFiles(dbSettings); + } else + tableToFieldInfos = processDatabase(dbSettings); + generateReport(tableToFieldInfos, filename); + } + + private Map> processDatabase(DbSettings dbSettings) { + Map> tableToFieldInfos = new HashMap>(); + RichConnection connection = new RichConnection(dbSettings.server, dbSettings.domain, dbSettings.user, dbSettings.password, dbSettings.dbType); + connection.setVerbose(false); + connection.use(dbSettings.database); + + dbType = dbSettings.dbType; + database = dbSettings.database; + + for (String table : dbSettings.tables) { + List fieldInfos = processDatabaseTable(table, connection); + tableToFieldInfos.put(table, fieldInfos); + } + + connection.close(); + return tableToFieldInfos; + } + + private Map> processCsvFiles(DbSettings dbSettings) { + delimiter = dbSettings.delimiter; + Map> tableToFieldInfos = new HashMap>(); + for (String table : dbSettings.tables) { + List fieldInfos = processCsvFile(table); + String tableName = new File(table).getName(); + if (!tableToFieldInfos.containsKey(tableName)) { + tableToFieldInfos.put(tableName, fieldInfos); + } else { + tableToFieldInfos.put(table, fieldInfos); + } + + } + return tableToFieldInfos; + } + + private void generateReport(Map> tableToFieldInfos, String filename) { + System.out.println("Generating scan report"); + removeEmptyTables(tableToFieldInfos); + List tables = new ArrayList(tableToFieldInfos.keySet()); + Collections.sort(tables); + + SXSSFWorkbook workbook = new SXSSFWorkbook(100); // keep 100 rows in memory, exceeding rows will be flushed to disk + + // Create overview sheet + Sheet sheet = workbook.createSheet("Overview"); + if (!scanValues) { + addRow(sheet, "Table", "Field", "Type", "N rows"); + for (String table : tables) { + for (FieldInfo fieldInfo : tableToFieldInfos.get(table)) + addRow(sheet, table, fieldInfo.name, fieldInfo.getTypeDescription(), Long.valueOf(fieldInfo.rowCount)); + addRow(sheet, ""); + } + } else { + addRow(sheet, "Table", "Field", "Type", "Max length", "N rows", "N rows checked", "Fraction empty"); + for (String table : tables) { + for (FieldInfo fieldInfo : tableToFieldInfos.get(table)) + addRow(sheet, table, fieldInfo.name, fieldInfo.getTypeDescription(), Integer.valueOf(fieldInfo.maxLength), Long.valueOf(fieldInfo.rowCount), + Long.valueOf(fieldInfo.nProcessed), fieldInfo.getFractionEmpty()); + addRow(sheet, ""); + } + + // Create per table sheets + for (String table : tables) { + sheet = workbook.createSheet(table); + List fieldInfos = tableToFieldInfos.get(table); + List>> valueCounts = new ArrayList>>(); + Object[] header = new Object[fieldInfos.size() * 2]; + int maxCount = 0; + for (int i = 0; i < fieldInfos.size(); i++) { + FieldInfo fieldInfo = fieldInfos.get(i); + header[i * 2] = fieldInfo.name; + if (fieldInfo.isFreeText) + header[(i * 2) + 1] = "Word count"; + else + header[(i * 2) + 1] = "Frequency"; + List> counts = fieldInfo.getSortedValuesWithoutSmallValues(); + valueCounts.add(counts); + if (counts.size() > maxCount) + maxCount = counts.size(); + } + addRow(sheet, header); + for (int i = 0; i < maxCount; i++) { + Object[] row = new Object[fieldInfos.size() * 2]; + for (int j = 0; j < fieldInfos.size(); j++) { + List> counts = valueCounts.get(j); + if (counts.size() > i) { + row[j * 2] = counts.get(i).getItem1(); + row[(j * 2) + 1] = counts.get(i).getItem2() == -1 ? "" : counts.get(i).getItem2(); + } else { + row[j * 2] = ""; + row[(j * 2) + 1] = ""; + } + } + addRow(sheet, row); + } + // Save some memory by derefencing tables already included in the report: + tableToFieldInfos.remove(table); + } + } + + try { + FileOutputStream out = new FileOutputStream(new File(filename)); + workbook.write(out); + out.close(); + StringUtilities.outputWithTime("Scan report generated: " + filename); + } catch (IOException e) { + throw new RuntimeException(e.getMessage()); + } + } + + private void removeEmptyTables(Map> tableToFieldInfos) { + Iterator>> iterator = tableToFieldInfos.entrySet().iterator(); + while (iterator.hasNext()) { + if (iterator.next().getValue().size() == 0) + iterator.remove(); + } + } + + private List processDatabaseTable(String table, RichConnection connection) { + StringUtilities.outputWithTime("Scanning table " + table); + + long rowCount = connection.getTableSize(table); + List fieldInfos = fetchTableStructure(connection, table); + if (scanValues) { + int actualCount = 0; + QueryResult queryResult = null; + try { + queryResult = fetchRowsFromTable(connection, table, rowCount); + for (org.ohdsi.utilities.files.Row row : queryResult) { + for (int i = 0; i < fieldInfos.size(); i++) + fieldInfos.get(i).processValue(row.get(fieldInfos.get(i).name)); + actualCount++; + if (sampleSize != -1 && actualCount >= sampleSize) { + System.out.println("Stopped after " + actualCount + " rows"); + break; + } + } + for (FieldInfo fieldInfo : fieldInfos) + fieldInfo.trim(); + } catch (Exception e) { + System.out.println("Error: " + e.getMessage()); + } finally { + if (queryResult != null) { + queryResult.close(); + } + } + } + + return fieldInfos; + } + + private QueryResult fetchRowsFromTable(RichConnection connection, String table, long rowCount) { + String query = null; + + if (sampleSize == -1) { + if (dbType == DbType.MSSQL || dbType == DbType.PDW || dbType == DbType.MSACCESS) + query = "SELECT * FROM [" + table + "]"; + else + query = "SELECT * FROM " + table; + } else { + if (dbType == DbType.MSSQL) + query = "SELECT * FROM [" + table + "] TABLESAMPLE (" + sampleSize + " ROWS)"; + else if (dbType == DbType.MYSQL) + query = "SELECT * FROM " + table + " ORDER BY RAND() LIMIT " + sampleSize; + else if (dbType == DbType.PDW) + query = "SELECT TOP " + sampleSize + " * FROM [" + table + "] ORDER BY RAND()"; + else if (dbType == DbType.ORACLE) { + if (sampleSize < rowCount) { + double percentage = 100 * sampleSize / (double) rowCount; + if (percentage < 100) + query = "SELECT * FROM " + table + " SAMPLE(" + percentage + ")"; + } else { + query = "SELECT * FROM " + table; + } + } else if (dbType == DbType.POSTGRESQL || dbType == DbType.REDSHIFT) + query = "SELECT * FROM " + table + " ORDER BY RANDOM() LIMIT " + sampleSize; + else if (dbType == DbType.MSACCESS) + query = "SELECT " + "TOP " + sampleSize + " * FROM [" + table + "]"; + } + // System.out.println("SQL: " + query); + return connection.query(query); + + } + + private List fetchTableStructure(RichConnection connection, String table) { + List fieldInfos = new ArrayList(); + + if (dbType == DbType.MSACCESS) { + ResultSet rs = connection.getMsAccessFieldNames(table); + try { + while (rs.next()) { + FieldInfo fieldInfo = new FieldInfo(rs.getString("COLUMN_NAME")); + fieldInfo.type = rs.getString("TYPE_NAME"); + fieldInfo.rowCount = connection.getTableSize(table); + fieldInfos.add(fieldInfo); + } + } catch (SQLException e) { + throw new RuntimeException(e.getMessage()); + } + } else { + String query = null; + if (dbType == DbType.ORACLE) + query = "SELECT COLUMN_NAME,DATA_TYPE FROM ALL_TAB_COLUMNS WHERE table_name = '" + table + "' AND owner = '" + database.toUpperCase() + "'"; + else if (dbType == DbType.MSSQL || dbType == DbType.PDW) { + String trimmedDatabase = database; + if (database.startsWith("[") && database.endsWith("]")) + trimmedDatabase = database.substring(1, database.length() - 1); + query = "SELECT COLUMN_NAME,DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_CATALOG='" + trimmedDatabase + "' AND TABLE_NAME='" + table + + "';"; + } else if (dbType == DbType.MYSQL) + query = "SELECT COLUMN_NAME,DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '" + database + "' AND TABLE_NAME = '" + table + + "';"; + else if (dbType == DbType.POSTGRESQL || dbType == DbType.REDSHIFT) + query = "SELECT COLUMN_NAME,DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '" + database.toLowerCase() + "' AND TABLE_NAME = '" + + table.toLowerCase() + "' ORDER BY ordinal_position;"; + else if (dbType == DbType.TERADATA) { + query = "SELECT ColumnName, ColumnType FROM dbc.columns WHERE DatabaseName= '" + database.toLowerCase() + "' AND TableName = '" + + table.toLowerCase() + "';"; + } + + for (org.ohdsi.utilities.files.Row row : connection.query(query)) { + row.upperCaseFieldNames(); + FieldInfo fieldInfo; + if (dbType == DbType.TERADATA) { + fieldInfo = new FieldInfo(row.get("COLUMNNAME")); + } else { + fieldInfo = new FieldInfo(row.get("COLUMN_NAME")); + } + if (dbType == DbType.TERADATA) { + fieldInfo.type = row.get("COLUMNTYPE"); + } else { + fieldInfo.type = row.get("DATA_TYPE"); + } + fieldInfo.rowCount = connection.getTableSize(table); + ; + fieldInfos.add(fieldInfo); + } + } + return fieldInfos; + } + + private List processCsvFile(String filename) { + StringUtilities.outputWithTime("Scanning table " + filename); + List fieldInfos = new ArrayList(); + int lineNr = 0; + for (String line : new ReadTextFile(filename)) { + lineNr++; + List row = StringUtilities.safeSplit(line, delimiter); + for (int i = 0; i < row.size(); i++) { + String column = row.get(i); + if (column.startsWith("\"") && column.endsWith("\"") && column.length() > 1) + column = column.substring(1, column.length() - 1); + column = column.replace("\\\"", "\""); + row.set(i, column); + } + if (lineNr == 1) { + for (String cell : row) + fieldInfos.add(new FieldInfo(cell)); + } else { + if (row.size() == fieldInfos.size()) { // Else there appears to be a formatting error, so skip + for (int i = 0; i < row.size(); i++) + fieldInfos.get(i).processValue(row.get(i)); + } + } + if (sampleSize != -1 && lineNr == sampleSize) + break; + } + for (FieldInfo fieldInfo : fieldInfos) + fieldInfo.trim(); + + return fieldInfos; + } + + private class FieldInfo { + public String type; + public String name; + public CountingSet valueCounts = new CountingSet(); + public long sumLength = 0; + public int maxLength = 0; + public long nProcessed = 0; + public long emptyCount = 0; + public long rowCount = -1; + public boolean isInteger = true; + public boolean isReal = true; + public boolean isDate = true; + public boolean isFreeText = false; + public boolean tooManyValues = false; + + public FieldInfo(String name) { + this.name = name; + } + + public void trim() { + if (valueCounts.size() > maxValues) + valueCounts.keepTopN(maxValues); + } + + public Double getFractionEmpty() { + if (nProcessed == 0) + return 0d; + else + return emptyCount / (double) nProcessed; + } + + public String getTypeDescription() { + if (type != null) + return type; + else if (nProcessed == emptyCount) + return "empty"; + else if (isFreeText) + return "text"; + else if (isDate) + return "date"; + else if (isInteger) + return "int"; + else if (isReal) + return "real"; + else + return "varchar"; + } + + public void processValue(String value) { + String trimValue = value.trim(); + nProcessed++; + sumLength += value.length(); + if (value.length() > maxLength) + maxLength = value.length(); + + if (trimValue.length() == 0) + emptyCount++; + + if (!isFreeText) { + valueCounts.add(value); + + if (trimValue.length() != 0) { + if (isReal && !StringUtilities.isNumber(trimValue)) + isReal = false; + if (isInteger && !StringUtilities.isLong(trimValue)) + isInteger = false; + if (isDate && !StringUtilities.isDate(trimValue)) + isDate = false; + } + if (nProcessed == N_FOR_FREE_TEXT_CHECK) { + if (!isInteger && !isReal && !isDate) { + double averageLength = sumLength / (double) (nProcessed - emptyCount); + if (averageLength >= MIN_AVERAGE_LENGTH_FOR_FREE_TEXT) { + isFreeText = true; + CountingSet wordCounts = new CountingSet(); + for (Map.Entry entry : valueCounts.key2count.entrySet()) + for (String word : StringUtilities.mapToWords(entry.getKey().toLowerCase())) + wordCounts.add(word, entry.getValue().count); + valueCounts = wordCounts; + } + } + } + } else { + for (String word : StringUtilities.mapToWords(trimValue.toLowerCase())) + valueCounts.add(word); + } + + if (!tooManyValues && valueCounts.size() > MAX_VALUES_IN_MEMORY) { + tooManyValues = true; + valueCounts.keepTopN(maxValues); + } + } + + public List> getSortedValuesWithoutSmallValues() { + boolean truncated = false; + List> result = new ArrayList>(); + + for (Map.Entry entry : valueCounts.key2count.entrySet()) { + if (entry.getValue().count < minCellCount) + truncated = true; + else { + result.add(new Pair(entry.getKey(), entry.getValue().count)); + if (result.size() > maxValues) { + truncated = true; + break; + } + } + } + + Collections.sort(result, new Comparator>() { + public int compare(Pair o1, Pair o2) { + return o2.getItem2().compareTo(o1.getItem2()); + } + }); + if (truncated) + result.add(new Pair("List truncated...", -1)); + return result; + } + } + + private void addRow(Sheet sheet, Object... values) { + Row row = sheet.createRow(sheet.getPhysicalNumberOfRows()); + for (Object value : values) { + Cell cell = row.createCell(row.getPhysicalNumberOfCells()); + + if (value instanceof Integer || value instanceof Long || value instanceof Double) + cell.setCellValue(Double.parseDouble(value.toString())); + else + cell.setCellValue(value.toString()); + + } + } +}