From d8ff9350d4613b1d07b52da0c8e8f2fcf7fde91d Mon Sep 17 00:00:00 2001
From: schuemie <schuemie@ohdsi.org>
Date: Wed, 9 Jan 2019 09:42:39 +0100
Subject: [PATCH] Code cleanup

---
 .../whiteRabbit/scan/SourceDataScan.java      | 1019 ++++++++---------
 1 file changed, 481 insertions(+), 538 deletions(-)

diff --git a/src/org/ohdsi/whiteRabbit/scan/SourceDataScan.java b/src/org/ohdsi/whiteRabbit/scan/SourceDataScan.java
index cd08ae0a..f5dd4f1f 100644
--- a/src/org/ohdsi/whiteRabbit/scan/SourceDataScan.java
+++ b/src/org/ohdsi/whiteRabbit/scan/SourceDataScan.java
@@ -1,538 +1,481 @@
-/*******************************************************************************
- * Copyright 2017 Observational Health Data Sciences and Informatics
- * 
- * This file is part of WhiteRabbit
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- ******************************************************************************/
-package org.ohdsi.whiteRabbit.scan;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.poi.ss.usermodel.Cell;
-import org.apache.poi.ss.usermodel.Row;
-import org.apache.poi.ss.usermodel.Sheet;
-import org.apache.poi.xssf.streaming.SXSSFWorkbook;
-import org.ohdsi.databases.DbType;
-import org.ohdsi.databases.RichConnection;
-import org.ohdsi.databases.RichConnection.QueryResult;
-import org.ohdsi.utilities.StringUtilities;
-import org.ohdsi.utilities.collections.CountingSet;
-import org.ohdsi.utilities.collections.CountingSet.Count;
-import org.ohdsi.utilities.collections.Pair;
-import org.ohdsi.utilities.files.ReadTextFile;
-import org.ohdsi.whiteRabbit.DbSettings;
-
-public class SourceDataScan {
-
-	public static int	MAX_VALUES_IN_MEMORY				= 100000;
-	// public static int MAX_VALUES_TO_REPORT = 25000;
-	public static int	MIN_CELL_COUNT_FOR_CSV				= 1000000;
-	public static int	N_FOR_FREE_TEXT_CHECK				= 1000;
-	public static int	MIN_AVERAGE_LENGTH_FOR_FREE_TEXT	= 100;
-
-	private char		delimiter							= ',';
-	private int			sampleSize;
-	private boolean		scanValues;
-	private int			minCellCount;
-	private int			maxValues;
-	private DbType		dbType;
-	private String		database;
-
-	public static void main(String[] args) {
-		DbSettings dbSettings = new DbSettings();
-		dbSettings.dataType = DbSettings.DATABASE;
-		dbSettings.dbType = DbType.POSTGRESQL;
-		dbSettings.server = "127.0.0.1/ohdsi";
-		dbSettings.database = "ars";
-		dbSettings.tables.add("drugs");
-		dbSettings.user = "postgres";
-		dbSettings.password = "";
-		SourceDataScan scan = new SourceDataScan();
-		scan.process(dbSettings, 100000, true, 5, 1000, "s:/temp/ScanReport.xlsx");
-
-		// DbSettings dbSettings = new DbSettings();
-		// dbSettings.dataType = DbSettings.DATABASE;
-		// dbSettings.dbType = DbType.ORACLE;
-		// dbSettings.server = "127.0.0.1/xe";
-		// dbSettings.database = "test";
-		// dbSettings.tables.add("test_table");
-		// dbSettings.user = "system";
-		// dbSettings.password = "F1r3starter";
-		// SourceDataScan scan = new SourceDataScan();
-		// scan.process(dbSettings, 1000000, "s:/data/ScanReport.xlsx");
-
-		// DbSettings dbSettings = new DbSettings();
-		// dbSettings.dataType = DbSettings.DATABASE;
-		// dbSettings.dbType = DbType.MSSQL;
-		// dbSettings.server = "RNDUSRDHIT04";
-		// dbSettings.database = "[HCUP-NIS]";
-		// dbSettings.tables.add("hospital");
-		// dbSettings.tables.add("severity");
-		// dbSettings.tables.add("dx_pr_grps");
-		// dbSettings.tables.add("core");
-		// SourceDataScan scan = new SourceDataScan();
-		// scan.process(dbSettings, 1000000, true, 25, "s:/data/ScanReport.xlsx");
-
-		// DbSettings dbSettings = new DbSettings();
-		// dbSettings.dataType = DbSettings.DATABASE;
-		// dbSettings.dbType = DbType.MYSQL;
-		// dbSettings.server = "127.0.0.1";
-		// dbSettings.database = "CDM_v4";
-		// dbSettings.user = "root";
-		// dbSettings.password = "F1r3starter";
-		// dbSettings.tables.add("person");
-		// dbSettings.tables.add("provider");
-		// SourceDataScan scan = new SourceDataScan();
-		// scan.process(dbSettings, 100000, true, 25, "c:/temp/ScanReport.xlsx");
-		//
-		// DbSettings dbSettings = new DbSettings();
-		// dbSettings.dataType = DbSettings.CSVFILES;
-		// dbSettings.delimiter = ',';
-		// dbSettings.tables.add("S:/Data/ARS/Simulation/DDRUG.csv");
-		// dbSettings.tables.add("S:/Data/ARS/Simulation/HOSP.csv");
-		// SourceDataScan scan = new SourceDataScan();
-		// scan.process(dbSettings, 100000, false, 25, "c:/temp/ScanReport.xlsx");
-	}
-
-	public void process(DbSettings dbSettings, int sampleSize, boolean scanValues, int minCellCount, int maxValues, String filename) {
-		this.sampleSize = sampleSize;
-		this.scanValues = scanValues;
-		this.minCellCount = minCellCount;
-		this.maxValues = maxValues;
-		Map<String, List<FieldInfo>> tableToFieldInfos;
-		if (dbSettings.dataType == DbSettings.CSVFILES) {
-			if (!scanValues)
-				minCellCount = Math.max(minCellCount, MIN_CELL_COUNT_FOR_CSV);
-			tableToFieldInfos = processCsvFiles(dbSettings);
-		} else
-			tableToFieldInfos = processDatabase(dbSettings);
-		generateReport(tableToFieldInfos, filename);
-	}
-
-	private Map<String, List<FieldInfo>> processDatabase(DbSettings dbSettings) {
-		Map<String, List<FieldInfo>> tableToFieldInfos = new HashMap<String, List<FieldInfo>>();
-		RichConnection connection = new RichConnection(dbSettings.server, dbSettings.domain, dbSettings.user, dbSettings.password, dbSettings.dbType);
-		connection.setVerbose(false);
-		connection.use(dbSettings.database);
-
-		dbType = dbSettings.dbType;
-		database = dbSettings.database;
-
-		for (String table : dbSettings.tables) {
-			List<FieldInfo> fieldInfos = processDatabaseTable(table, connection);
-			tableToFieldInfos.put(table, fieldInfos);
-		}
-
-		connection.close();
-		return tableToFieldInfos;
-	}
-
-	private Map<String, List<FieldInfo>> processCsvFiles(DbSettings dbSettings) {
-		delimiter = dbSettings.delimiter;
-		Map<String, List<FieldInfo>> tableToFieldInfos = new HashMap<String, List<FieldInfo>>();
-		for (String table : dbSettings.tables) {
-			List<FieldInfo> fieldInfos = processCsvFile(table);
-			String tableName = new File(table).getName();
-			if (!tableToFieldInfos.containsKey(tableName)) {
-				tableToFieldInfos.put(tableName, fieldInfos);
-			} else {
-				tableToFieldInfos.put(table, fieldInfos);
-			}
-
-		}
-		return tableToFieldInfos;
-	}
-
-	private void generateReport(Map<String, List<FieldInfo>> tableToFieldInfos, String filename) {
-		System.out.println("Generating scan report");
-		removeEmptyTables(tableToFieldInfos);
-		List<String> tables = new ArrayList<String>(tableToFieldInfos.keySet());
-		Collections.sort(tables);
-
-		SXSSFWorkbook workbook = new SXSSFWorkbook(100); // keep 100 rows in memory, exceeding rows will be flushed to disk
-
-		// Create overview sheet
-		Sheet sheet = workbook.createSheet("Overview");
-		if (!scanValues) {
-			addRow(sheet, "Table", "Field", "Type", "N rows");
-			for (String table : tables) {
-				for (FieldInfo fieldInfo : tableToFieldInfos.get(table))
-					addRow(sheet, table, fieldInfo.name, fieldInfo.getTypeDescription(), Long.valueOf(fieldInfo.rowCount));
-				addRow(sheet, "");
-			}
-		} else {
-			addRow(sheet, "Table", "Field", "Type", "Max length", "N rows", "N rows checked", "Fraction empty");
-			for (String table : tables) {
-				for (FieldInfo fieldInfo : tableToFieldInfos.get(table))
-					addRow(sheet, table, fieldInfo.name, fieldInfo.getTypeDescription(), Integer.valueOf(fieldInfo.maxLength), Long.valueOf(fieldInfo.rowCount),
-							Long.valueOf(fieldInfo.nProcessed), fieldInfo.getFractionEmpty());
-				addRow(sheet, "");
-			}
-
-			// Create per table sheets
-			for (String table : tables) {
-				sheet = workbook.createSheet(table);
-				List<FieldInfo> fieldInfos = tableToFieldInfos.get(table);
-				List<List<Pair<String, Integer>>> valueCounts = new ArrayList<List<Pair<String, Integer>>>();
-				Object[] header = new Object[fieldInfos.size() * 2];
-				int maxCount = 0;
-				for (int i = 0; i < fieldInfos.size(); i++) {
-					FieldInfo fieldInfo = fieldInfos.get(i);
-					header[i * 2] = fieldInfo.name;
-					if (fieldInfo.isFreeText)
-						header[(i * 2) + 1] = "Word count";
-					else
-						header[(i * 2) + 1] = "Frequency";
-					List<Pair<String, Integer>> counts = fieldInfo.getSortedValuesWithoutSmallValues();
-					valueCounts.add(counts);
-					if (counts.size() > maxCount)
-						maxCount = counts.size();
-				}
-				addRow(sheet, header);
-				for (int i = 0; i < maxCount; i++) {
-					Object[] row = new Object[fieldInfos.size() * 2];
-					for (int j = 0; j < fieldInfos.size(); j++) {
-						List<Pair<String, Integer>> counts = valueCounts.get(j);
-						if (counts.size() > i) {
-							row[j * 2] = counts.get(i).getItem1();
-							row[(j * 2) + 1] = counts.get(i).getItem2() == -1 ? "" : counts.get(i).getItem2();
-						} else {
-							row[j * 2] = "";
-							row[(j * 2) + 1] = "";
-						}
-					}
-					addRow(sheet, row);
-				}
-				// Save some memory by derefencing tables already included in the report:
-				tableToFieldInfos.remove(table);
-			}
-		}
-
-		try {
-			FileOutputStream out = new FileOutputStream(new File(filename));
-			workbook.write(out);
-			out.close();
-			StringUtilities.outputWithTime("Scan report generated: " + filename);
-		} catch (IOException e) {
-			throw new RuntimeException(e.getMessage());
-		}
-	}
-
-	private void removeEmptyTables(Map<String, List<FieldInfo>> tableToFieldInfos) {
-		Iterator<Map.Entry<String, List<FieldInfo>>> iterator = tableToFieldInfos.entrySet().iterator();
-		while (iterator.hasNext()) {
-			if (iterator.next().getValue().size() == 0)
-				iterator.remove();
-		}
-	}
-
-	private List<FieldInfo> processDatabaseTable(String table, RichConnection connection) {
-		StringUtilities.outputWithTime("Scanning table " + table);
-
-		long rowCount = connection.getTableSize(table);
-		List<FieldInfo> fieldInfos = fetchTableStructure(connection, table);
-		if (scanValues) {
-			int actualCount = 0;
-			QueryResult queryResult = null;
-			try {
-				queryResult = fetchRowsFromTable(connection, table, rowCount);
-				for (org.ohdsi.utilities.files.Row row : queryResult) {
-					for (int i = 0; i < fieldInfos.size(); i++)
-						fieldInfos.get(i).processValue(row.get(fieldInfos.get(i).name));
-					actualCount++;
-					if (sampleSize != -1 && actualCount >= sampleSize) {
-						System.out.println("Stopped after " + actualCount + " rows");
-						break;
-					}
-				}
-				for (FieldInfo fieldInfo : fieldInfos)
-					fieldInfo.trim();
-			} catch (Exception e) {
-				System.out.println("Error: " + e.getMessage());
-			} finally {
-				if (queryResult != null) {
-					queryResult.close();
-				}
-			}
-		}
-
-		return fieldInfos;
-	}
-
-	private QueryResult fetchRowsFromTable(RichConnection connection, String table, long rowCount) {
-		String query = null;
-
-		if (sampleSize == -1) {
-			if (dbType == DbType.MSSQL || dbType == DbType.PDW || dbType == DbType.MSACCESS)
-				query = "SELECT * FROM [" + table + "]";
-			else
-				query = "SELECT * FROM " + table;
-		} else {
-			if (dbType == DbType.MSSQL)
-				query = "SELECT * FROM [" + table + "] TABLESAMPLE (" + sampleSize + " ROWS)";
-			else if (dbType == DbType.MYSQL)
-				query = "SELECT * FROM " + table + " ORDER BY RAND() LIMIT " + sampleSize;
-			else if (dbType == DbType.PDW)
-				query = "SELECT TOP " + sampleSize + " * FROM [" + table + "] ORDER BY RAND()";
-			else if (dbType == DbType.ORACLE) {
-				if (sampleSize < rowCount) {
-					double percentage = 100 * sampleSize / (double) rowCount;
-					if (percentage < 100)
-						query = "SELECT * FROM " + table + " SAMPLE(" + percentage + ")";
-				} else {
-					query = "SELECT * FROM " + table;
-				}
-			} else if (dbType == DbType.POSTGRESQL || dbType == DbType.REDSHIFT)
-				query = "SELECT * FROM " + table + " ORDER BY RANDOM() LIMIT " + sampleSize;
-			else if (dbType == DbType.MSACCESS)
-				query = "SELECT " + "TOP " + sampleSize + " * FROM [" + table + "]";
-		}
-		// System.out.println("SQL: " + query);
-		return connection.query(query);
-
-	}
-
-	private List<FieldInfo> fetchTableStructure(RichConnection connection, String table) {
-		List<FieldInfo> fieldInfos = new ArrayList<FieldInfo>();
-
-		if (dbType == DbType.MSACCESS) {
-			ResultSet rs = connection.getMsAccessFieldNames(table);
-			try {
-				while (rs.next()) {
-					FieldInfo fieldInfo = new FieldInfo(rs.getString("COLUMN_NAME"));
-					fieldInfo.type = rs.getString("TYPE_NAME");
-					fieldInfo.rowCount = connection.getTableSize(table);
-					fieldInfos.add(fieldInfo);
-				}
-			} catch (SQLException e) {
-				throw new RuntimeException(e.getMessage());
-			}
-		} else {
-			String query = null;
-			if (dbType == DbType.ORACLE)
-				query = "SELECT COLUMN_NAME,DATA_TYPE FROM ALL_TAB_COLUMNS WHERE table_name = '" + table + "' AND owner = '" + database.toUpperCase() + "'";
-			else if (dbType == DbType.MSSQL || dbType == DbType.PDW) {
-				String trimmedDatabase = database;
-				if (database.startsWith("[") && database.endsWith("]"))
-					trimmedDatabase = database.substring(1, database.length() - 1);
-				query = "SELECT COLUMN_NAME,DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_CATALOG='" + trimmedDatabase + "' AND TABLE_NAME='" + table
-						+ "';";
-			} else if (dbType == DbType.MYSQL)
-				query = "SELECT COLUMN_NAME,DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '" + database + "' AND TABLE_NAME = '" + table
-						+ "';";
-			else if (dbType == DbType.POSTGRESQL || dbType == DbType.REDSHIFT)
-				query = "SELECT COLUMN_NAME,DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '" + database.toLowerCase() + "' AND TABLE_NAME = '"
-						+ table.toLowerCase() + "' ORDER BY ordinal_position;";
-			else if (dbType == DbType.TERADATA) {
-				query = "SELECT ColumnName, ColumnType FROM dbc.columns WHERE DatabaseName= '" + database.toLowerCase() + "' AND TableName = '"
-						+ table.toLowerCase() + "';";
-			}
-
-			for (org.ohdsi.utilities.files.Row row : connection.query(query)) {
-				row.upperCaseFieldNames();
-				FieldInfo fieldInfo;
-				if (dbType == DbType.TERADATA) {
-					fieldInfo = new FieldInfo(row.get("COLUMNNAME"));
-				} else {
-					fieldInfo = new FieldInfo(row.get("COLUMN_NAME"));
-				}
-				if (dbType == DbType.TERADATA) {
-					fieldInfo.type = row.get("COLUMNTYPE");
-				} else {
-					fieldInfo.type = row.get("DATA_TYPE");
-				}
-				fieldInfo.rowCount = connection.getTableSize(table);
-				;
-				fieldInfos.add(fieldInfo);
-			}
-		}
-		return fieldInfos;
-	}
-
-	private List<FieldInfo> processCsvFile(String filename) {
-		StringUtilities.outputWithTime("Scanning table " + filename);
-		List<FieldInfo> fieldInfos = new ArrayList<FieldInfo>();
-		int lineNr = 0;
-		for (String line : new ReadTextFile(filename)) {
-			lineNr++;
-			List<String> row = StringUtilities.safeSplit(line, delimiter);
-			for (int i = 0; i < row.size(); i++) {
-				String column = row.get(i);
-				if (column.startsWith("\"") && column.endsWith("\"") && column.length() > 1)
-					column = column.substring(1, column.length() - 1);
-				column = column.replace("\\\"", "\"");
-				row.set(i, column);
-			}
-			if (lineNr == 1) {
-				for (String cell : row)
-					fieldInfos.add(new FieldInfo(cell));
-			} else {
-				if (row.size() == fieldInfos.size()) { // Else there appears to be a formatting error, so skip
-					for (int i = 0; i < row.size(); i++)
-						fieldInfos.get(i).processValue(row.get(i));
-				}
-			}
-			if (sampleSize != -1 && lineNr == sampleSize)
-				break;
-		}
-		for (FieldInfo fieldInfo : fieldInfos)
-			fieldInfo.trim();
-
-		return fieldInfos;
-	}
-
-	private class FieldInfo {
-		public String				type;
-		public String				name;
-		public CountingSet<String>	valueCounts		= new CountingSet<String>();
-		public long					sumLength		= 0;
-		public int					maxLength		= 0;
-		public long					nProcessed		= 0;
-		public long					emptyCount		= 0;
-		public long					rowCount		= -1;
-		public boolean				isInteger		= true;
-		public boolean				isReal			= true;
-		public boolean				isDate			= true;
-		public boolean				isFreeText		= false;
-		public boolean				tooManyValues	= false;
-
-		public FieldInfo(String name) {
-			this.name = name;
-		}
-
-		public void trim() {
-			if (valueCounts.size() > maxValues)
-				valueCounts.keepTopN(maxValues);
-		}
-
-		public Double getFractionEmpty() {
-			if (nProcessed == 0)
-				return 0d;
-			else
-				return emptyCount / (double) nProcessed;
-		}
-
-		public String getTypeDescription() {
-			if (type != null)
-				return type;
-			else if (nProcessed == emptyCount)
-				return "empty";
-			else if (isFreeText)
-				return "text";
-			else if (isDate)
-				return "date";
-			else if (isInteger)
-				return "int";
-			else if (isReal)
-				return "real";
-			else
-				return "varchar";
-		}
-
-		public void processValue(String value) {
-			String trimValue = value.trim();
-			nProcessed++;
-			sumLength += value.length();
-			if (value.length() > maxLength)
-				maxLength = value.length();
-
-			if (trimValue.length() == 0)
-				emptyCount++;
-
-			if (!isFreeText) {
-				valueCounts.add(value);
-
-				if (trimValue.length() != 0) {
-					if (isReal && !StringUtilities.isNumber(trimValue))
-						isReal = false;
-					if (isInteger && !StringUtilities.isLong(trimValue))
-						isInteger = false;
-					if (isDate && !StringUtilities.isDate(trimValue))
-						isDate = false;
-				}
-				if (nProcessed == N_FOR_FREE_TEXT_CHECK) {
-					if (!isInteger && !isReal && !isDate) {
-						double averageLength = sumLength / (double) (nProcessed - emptyCount);
-						if (averageLength >= MIN_AVERAGE_LENGTH_FOR_FREE_TEXT) {
-							isFreeText = true;
-							CountingSet<String> wordCounts = new CountingSet<String>();
-							for (Map.Entry<String, Count> entry : valueCounts.key2count.entrySet())
-								for (String word : StringUtilities.mapToWords(entry.getKey().toLowerCase()))
-									wordCounts.add(word, entry.getValue().count);
-							valueCounts = wordCounts;
-						}
-					}
-				}
-			} else {
-				for (String word : StringUtilities.mapToWords(trimValue.toLowerCase()))
-					valueCounts.add(word);
-			}
-
-			if (!tooManyValues && valueCounts.size() > MAX_VALUES_IN_MEMORY) {
-				tooManyValues = true;
-				valueCounts.keepTopN(maxValues);
-			}
-		}
-
-		public List<Pair<String, Integer>> getSortedValuesWithoutSmallValues() {
-			boolean truncated = false;
-			List<Pair<String, Integer>> result = new ArrayList<Pair<String, Integer>>();
-
-			for (Map.Entry<String, Count> entry : valueCounts.key2count.entrySet()) {
-				if (entry.getValue().count < minCellCount)
-					truncated = true;
-				else {
-					result.add(new Pair<String, Integer>(entry.getKey(), entry.getValue().count));
-					if (result.size() > maxValues) {
-						truncated = true;
-						break;
-					}
-				}
-			}
-
-			Collections.sort(result, new Comparator<Pair<String, Integer>>() {
-				public int compare(Pair<String, Integer> o1, Pair<String, Integer> o2) {
-					return o2.getItem2().compareTo(o1.getItem2());
-				}
-			});
-			if (truncated)
-				result.add(new Pair<String, Integer>("List truncated...", -1));
-			return result;
-		}
-	}
-
-	private void addRow(Sheet sheet, Object... values) {
-		Row row = sheet.createRow(sheet.getPhysicalNumberOfRows());
-		for (Object value : values) {
-			Cell cell = row.createCell(row.getPhysicalNumberOfCells());
-
-			if (value instanceof Integer || value instanceof Long || value instanceof Double)
-				cell.setCellValue(Double.parseDouble(value.toString()));
-			else
-				cell.setCellValue(value.toString());
-
-		}
-	}
-}
+/*******************************************************************************
+ * Copyright 2017 Observational Health Data Sciences and Informatics
+ * 
+ * This file is part of WhiteRabbit
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+package org.ohdsi.whiteRabbit.scan;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.xssf.streaming.SXSSFWorkbook;
+import org.ohdsi.databases.DbType;
+import org.ohdsi.databases.RichConnection;
+import org.ohdsi.databases.RichConnection.QueryResult;
+import org.ohdsi.utilities.StringUtilities;
+import org.ohdsi.utilities.collections.CountingSet;
+import org.ohdsi.utilities.collections.CountingSet.Count;
+import org.ohdsi.utilities.collections.Pair;
+import org.ohdsi.utilities.files.ReadTextFile;
+import org.ohdsi.whiteRabbit.DbSettings;
+
+public class SourceDataScan {
+
+	public static int	MAX_VALUES_IN_MEMORY				= 100000;
+	public static int	MIN_CELL_COUNT_FOR_CSV				= 1000000;
+	public static int	N_FOR_FREE_TEXT_CHECK				= 1000;
+	public static int	MIN_AVERAGE_LENGTH_FOR_FREE_TEXT	= 100;
+
+	private char		delimiter							= ',';
+	private int			sampleSize;
+	private boolean		scanValues;
+	private int			minCellCount;
+	private int			maxValues;
+	private DbType		dbType;
+	private String		database;
+
+	public void process(DbSettings dbSettings, int sampleSize, boolean scanValues, int minCellCount, int maxValues, String filename) {
+		this.sampleSize = sampleSize;
+		this.scanValues = scanValues;
+		this.minCellCount = minCellCount;
+		this.maxValues = maxValues;
+		Map<String, List<FieldInfo>> tableToFieldInfos;
+		if (dbSettings.dataType == DbSettings.CSVFILES) {
+			if (!scanValues)
+				minCellCount = Math.max(minCellCount, MIN_CELL_COUNT_FOR_CSV);
+			tableToFieldInfos = processCsvFiles(dbSettings);
+		} else
+			tableToFieldInfos = processDatabase(dbSettings);
+		generateReport(tableToFieldInfos, filename);
+	}
+
+	private Map<String, List<FieldInfo>> processDatabase(DbSettings dbSettings) {
+		Map<String, List<FieldInfo>> tableToFieldInfos = new HashMap<String, List<FieldInfo>>();
+		RichConnection connection = new RichConnection(dbSettings.server, dbSettings.domain, dbSettings.user, dbSettings.password, dbSettings.dbType);
+		connection.setVerbose(false);
+		connection.use(dbSettings.database);
+
+		dbType = dbSettings.dbType;
+		database = dbSettings.database;
+
+		for (String table : dbSettings.tables) {
+			List<FieldInfo> fieldInfos = processDatabaseTable(table, connection);
+			tableToFieldInfos.put(table, fieldInfos);
+		}
+
+		connection.close();
+		return tableToFieldInfos;
+	}
+
+	private Map<String, List<FieldInfo>> processCsvFiles(DbSettings dbSettings) {
+		delimiter = dbSettings.delimiter;
+		Map<String, List<FieldInfo>> tableToFieldInfos = new HashMap<String, List<FieldInfo>>();
+		for (String table : dbSettings.tables) {
+			List<FieldInfo> fieldInfos = processCsvFile(table);
+			String tableName = new File(table).getName();
+			if (!tableToFieldInfos.containsKey(tableName)) {
+				tableToFieldInfos.put(tableName, fieldInfos);
+			} else {
+				tableToFieldInfos.put(table, fieldInfos);
+			}
+
+		}
+		return tableToFieldInfos;
+	}
+
+	private void generateReport(Map<String, List<FieldInfo>> tableToFieldInfos, String filename) {
+		System.out.println("Generating scan report");
+		removeEmptyTables(tableToFieldInfos);
+		List<String> tables = new ArrayList<String>(tableToFieldInfos.keySet());
+		Collections.sort(tables);
+
+		SXSSFWorkbook workbook = new SXSSFWorkbook(100); // keep 100 rows in memory, exceeding rows will be flushed to disk
+
+		// Create overview sheet
+		Sheet sheet = workbook.createSheet("Overview");
+		if (!scanValues) {
+			addRow(sheet, "Table", "Field", "Type", "N rows");
+			for (String table : tables) {
+				for (FieldInfo fieldInfo : tableToFieldInfos.get(table))
+					addRow(sheet, table, fieldInfo.name, fieldInfo.getTypeDescription(), Long.valueOf(fieldInfo.rowCount));
+				addRow(sheet, "");
+			}
+		} else {
+			addRow(sheet, "Table", "Field", "Type", "Max length", "N rows", "N rows checked", "Fraction empty");
+			for (String table : tables) {
+				for (FieldInfo fieldInfo : tableToFieldInfos.get(table))
+					addRow(sheet, table, fieldInfo.name, fieldInfo.getTypeDescription(), Integer.valueOf(fieldInfo.maxLength), Long.valueOf(fieldInfo.rowCount),
+							Long.valueOf(fieldInfo.nProcessed), fieldInfo.getFractionEmpty());
+				addRow(sheet, "");
+			}
+
+			// Create per table sheets
+			for (String table : tables) {
+				sheet = workbook.createSheet(table);
+				List<FieldInfo> fieldInfos = tableToFieldInfos.get(table);
+				List<List<Pair<String, Integer>>> valueCounts = new ArrayList<List<Pair<String, Integer>>>();
+				Object[] header = new Object[fieldInfos.size() * 2];
+				int maxCount = 0;
+				for (int i = 0; i < fieldInfos.size(); i++) {
+					FieldInfo fieldInfo = fieldInfos.get(i);
+					header[i * 2] = fieldInfo.name;
+					if (fieldInfo.isFreeText)
+						header[(i * 2) + 1] = "Word count";
+					else
+						header[(i * 2) + 1] = "Frequency";
+					List<Pair<String, Integer>> counts = fieldInfo.getSortedValuesWithoutSmallValues();
+					valueCounts.add(counts);
+					if (counts.size() > maxCount)
+						maxCount = counts.size();
+				}
+				addRow(sheet, header);
+				for (int i = 0; i < maxCount; i++) {
+					Object[] row = new Object[fieldInfos.size() * 2];
+					for (int j = 0; j < fieldInfos.size(); j++) {
+						List<Pair<String, Integer>> counts = valueCounts.get(j);
+						if (counts.size() > i) {
+							row[j * 2] = counts.get(i).getItem1();
+							row[(j * 2) + 1] = counts.get(i).getItem2() == -1 ? "" : counts.get(i).getItem2();
+						} else {
+							row[j * 2] = "";
+							row[(j * 2) + 1] = "";
+						}
+					}
+					addRow(sheet, row);
+				}
+				// Save some memory by derefencing tables already included in the report:
+				tableToFieldInfos.remove(table);
+			}
+		}
+
+		try {
+			FileOutputStream out = new FileOutputStream(new File(filename));
+			workbook.write(out);
+			out.close();
+			StringUtilities.outputWithTime("Scan report generated: " + filename);
+		} catch (IOException e) {
+			throw new RuntimeException(e.getMessage());
+		}
+	}
+
+	private void removeEmptyTables(Map<String, List<FieldInfo>> tableToFieldInfos) {
+		Iterator<Map.Entry<String, List<FieldInfo>>> iterator = tableToFieldInfos.entrySet().iterator();
+		while (iterator.hasNext()) {
+			if (iterator.next().getValue().size() == 0)
+				iterator.remove();
+		}
+	}
+
+	private List<FieldInfo> processDatabaseTable(String table, RichConnection connection) {
+		StringUtilities.outputWithTime("Scanning table " + table);
+
+		long rowCount = connection.getTableSize(table);
+		List<FieldInfo> fieldInfos = fetchTableStructure(connection, table);
+		if (scanValues) {
+			int actualCount = 0;
+			QueryResult queryResult = null;
+			try {
+				queryResult = fetchRowsFromTable(connection, table, rowCount);
+				for (org.ohdsi.utilities.files.Row row : queryResult) {
+					for (int i = 0; i < fieldInfos.size(); i++)
+						fieldInfos.get(i).processValue(row.get(fieldInfos.get(i).name));
+					actualCount++;
+					if (sampleSize != -1 && actualCount >= sampleSize) {
+						System.out.println("Stopped after " + actualCount + " rows");
+						break;
+					}
+				}
+				for (FieldInfo fieldInfo : fieldInfos)
+					fieldInfo.trim();
+			} catch (Exception e) {
+				System.out.println("Error: " + e.getMessage());
+			} finally {
+				if (queryResult != null) {
+					queryResult.close();
+				}
+			}
+		}
+
+		return fieldInfos;
+	}
+
+	private QueryResult fetchRowsFromTable(RichConnection connection, String table, long rowCount) {
+		String query = null;
+
+		if (sampleSize == -1) {
+			if (dbType == DbType.MSSQL || dbType == DbType.PDW || dbType == DbType.MSACCESS)
+				query = "SELECT * FROM [" + table + "]";
+			else
+				query = "SELECT * FROM " + table;
+		} else {
+			if (dbType == DbType.MSSQL)
+				query = "SELECT * FROM [" + table + "] TABLESAMPLE (" + sampleSize + " ROWS)";
+			else if (dbType == DbType.MYSQL)
+				query = "SELECT * FROM " + table + " ORDER BY RAND() LIMIT " + sampleSize;
+			else if (dbType == DbType.PDW)
+				query = "SELECT TOP " + sampleSize + " * FROM [" + table + "] ORDER BY RAND()";
+			else if (dbType == DbType.ORACLE) {
+				if (sampleSize < rowCount) {
+					double percentage = 100 * sampleSize / (double) rowCount;
+					if (percentage < 100)
+						query = "SELECT * FROM " + table + " SAMPLE(" + percentage + ")";
+				} else {
+					query = "SELECT * FROM " + table;
+				}
+			} else if (dbType == DbType.POSTGRESQL || dbType == DbType.REDSHIFT)
+				query = "SELECT * FROM " + table + " ORDER BY RANDOM() LIMIT " + sampleSize;
+			else if (dbType == DbType.MSACCESS)
+				query = "SELECT " + "TOP " + sampleSize + " * FROM [" + table + "]";
+		}
+		// System.out.println("SQL: " + query);
+		return connection.query(query);
+
+	}
+
+	private List<FieldInfo> fetchTableStructure(RichConnection connection, String table) {
+		List<FieldInfo> fieldInfos = new ArrayList<FieldInfo>();
+
+		if (dbType == DbType.MSACCESS) {
+			ResultSet rs = connection.getMsAccessFieldNames(table);
+			try {
+				while (rs.next()) {
+					FieldInfo fieldInfo = new FieldInfo(rs.getString("COLUMN_NAME"));
+					fieldInfo.type = rs.getString("TYPE_NAME");
+					fieldInfo.rowCount = connection.getTableSize(table);
+					fieldInfos.add(fieldInfo);
+				}
+			} catch (SQLException e) {
+				throw new RuntimeException(e.getMessage());
+			}
+		} else {
+			String query = null;
+			if (dbType == DbType.ORACLE)
+				query = "SELECT COLUMN_NAME,DATA_TYPE FROM ALL_TAB_COLUMNS WHERE table_name = '" + table + "' AND owner = '" + database.toUpperCase() + "'";
+			else if (dbType == DbType.MSSQL || dbType == DbType.PDW) {
+				String trimmedDatabase = database;
+				if (database.startsWith("[") && database.endsWith("]"))
+					trimmedDatabase = database.substring(1, database.length() - 1);
+				query = "SELECT COLUMN_NAME,DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_CATALOG='" + trimmedDatabase + "' AND TABLE_NAME='" + table
+						+ "';";
+			} else if (dbType == DbType.MYSQL)
+				query = "SELECT COLUMN_NAME,DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '" + database + "' AND TABLE_NAME = '" + table
+						+ "';";
+			else if (dbType == DbType.POSTGRESQL || dbType == DbType.REDSHIFT)
+				query = "SELECT COLUMN_NAME,DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '" + database.toLowerCase() + "' AND TABLE_NAME = '"
+						+ table.toLowerCase() + "' ORDER BY ordinal_position;";
+			else if (dbType == DbType.TERADATA) {
+				query = "SELECT ColumnName, ColumnType FROM dbc.columns WHERE DatabaseName= '" + database.toLowerCase() + "' AND TableName = '"
+						+ table.toLowerCase() + "';";
+			}
+
+			for (org.ohdsi.utilities.files.Row row : connection.query(query)) {
+				row.upperCaseFieldNames();
+				FieldInfo fieldInfo;
+				if (dbType == DbType.TERADATA) {
+					fieldInfo = new FieldInfo(row.get("COLUMNNAME"));
+				} else {
+					fieldInfo = new FieldInfo(row.get("COLUMN_NAME"));
+				}
+				if (dbType == DbType.TERADATA) {
+					fieldInfo.type = row.get("COLUMNTYPE");
+				} else {
+					fieldInfo.type = row.get("DATA_TYPE");
+				}
+				fieldInfo.rowCount = connection.getTableSize(table);
+				;
+				fieldInfos.add(fieldInfo);
+			}
+		}
+		return fieldInfos;
+	}
+
+	private List<FieldInfo> processCsvFile(String filename) {
+		StringUtilities.outputWithTime("Scanning table " + filename);
+		List<FieldInfo> fieldInfos = new ArrayList<FieldInfo>();
+		int lineNr = 0;
+		for (String line : new ReadTextFile(filename)) {
+			lineNr++;
+			List<String> row = StringUtilities.safeSplit(line, delimiter);
+			for (int i = 0; i < row.size(); i++) {
+				String column = row.get(i);
+				if (column.startsWith("\"") && column.endsWith("\"") && column.length() > 1)
+					column = column.substring(1, column.length() - 1);
+				column = column.replace("\\\"", "\"");
+				row.set(i, column);
+			}
+			if (lineNr == 1) {
+				for (String cell : row)
+					fieldInfos.add(new FieldInfo(cell));
+			} else {
+				if (row.size() == fieldInfos.size()) { // Else there appears to be a formatting error, so skip
+					for (int i = 0; i < row.size(); i++)
+						fieldInfos.get(i).processValue(row.get(i));
+				}
+			}
+			if (sampleSize != -1 && lineNr == sampleSize)
+				break;
+		}
+		for (FieldInfo fieldInfo : fieldInfos)
+			fieldInfo.trim();
+
+		return fieldInfos;
+	}
+
+	private class FieldInfo {
+		public String				type;
+		public String				name;
+		public CountingSet<String>	valueCounts		= new CountingSet<String>();
+		public long					sumLength		= 0;
+		public int					maxLength		= 0;
+		public long					nProcessed		= 0;
+		public long					emptyCount		= 0;
+		public long					rowCount		= -1;
+		public boolean				isInteger		= true;
+		public boolean				isReal			= true;
+		public boolean				isDate			= true;
+		public boolean				isFreeText		= false;
+		public boolean				tooManyValues	= false;
+
+		public FieldInfo(String name) {
+			this.name = name;
+		}
+
+		public void trim() {
+			if (valueCounts.size() > maxValues)
+				valueCounts.keepTopN(maxValues);
+		}
+
+		public Double getFractionEmpty() {
+			if (nProcessed == 0)
+				return 0d;
+			else
+				return emptyCount / (double) nProcessed;
+		}
+
+		public String getTypeDescription() {
+			if (type != null)
+				return type;
+			else if (nProcessed == emptyCount)
+				return "empty";
+			else if (isFreeText)
+				return "text";
+			else if (isDate)
+				return "date";
+			else if (isInteger)
+				return "int";
+			else if (isReal)
+				return "real";
+			else
+				return "varchar";
+		}
+
+		public void processValue(String value) {
+			String trimValue = value.trim();
+			nProcessed++;
+			sumLength += value.length();
+			if (value.length() > maxLength)
+				maxLength = value.length();
+
+			if (trimValue.length() == 0)
+				emptyCount++;
+
+			if (!isFreeText) {
+				valueCounts.add(value);
+
+				if (trimValue.length() != 0) {
+					if (isReal && !StringUtilities.isNumber(trimValue))
+						isReal = false;
+					if (isInteger && !StringUtilities.isLong(trimValue))
+						isInteger = false;
+					if (isDate && !StringUtilities.isDate(trimValue))
+						isDate = false;
+				}
+				if (nProcessed == N_FOR_FREE_TEXT_CHECK) {
+					if (!isInteger && !isReal && !isDate) {
+						double averageLength = sumLength / (double) (nProcessed - emptyCount);
+						if (averageLength >= MIN_AVERAGE_LENGTH_FOR_FREE_TEXT) {
+							isFreeText = true;
+							CountingSet<String> wordCounts = new CountingSet<String>();
+							for (Map.Entry<String, Count> entry : valueCounts.key2count.entrySet())
+								for (String word : StringUtilities.mapToWords(entry.getKey().toLowerCase()))
+									wordCounts.add(word, entry.getValue().count);
+							valueCounts = wordCounts;
+						}
+					}
+				}
+			} else {
+				for (String word : StringUtilities.mapToWords(trimValue.toLowerCase()))
+					valueCounts.add(word);
+			}
+
+			if (!tooManyValues && valueCounts.size() > MAX_VALUES_IN_MEMORY) {
+				tooManyValues = true;
+				valueCounts.keepTopN(maxValues);
+			}
+		}
+
+		public List<Pair<String, Integer>> getSortedValuesWithoutSmallValues() {
+			boolean truncated = false;
+			List<Pair<String, Integer>> result = new ArrayList<Pair<String, Integer>>();
+
+			for (Map.Entry<String, Count> entry : valueCounts.key2count.entrySet()) {
+				if (entry.getValue().count < minCellCount)
+					truncated = true;
+				else {
+					result.add(new Pair<String, Integer>(entry.getKey(), entry.getValue().count));
+					if (result.size() > maxValues) {
+						truncated = true;
+						break;
+					}
+				}
+			}
+
+			Collections.sort(result, new Comparator<Pair<String, Integer>>() {
+				public int compare(Pair<String, Integer> o1, Pair<String, Integer> o2) {
+					return o2.getItem2().compareTo(o1.getItem2());
+				}
+			});
+			if (truncated)
+				result.add(new Pair<String, Integer>("List truncated...", -1));
+			return result;
+		}
+	}
+
+	private void addRow(Sheet sheet, Object... values) {
+		Row row = sheet.createRow(sheet.getPhysicalNumberOfRows());
+		for (Object value : values) {
+			Cell cell = row.createCell(row.getPhysicalNumberOfCells());
+
+			if (value instanceof Integer || value instanceof Long || value instanceof Double)
+				cell.setCellValue(Double.parseDouble(value.toString()));
+			else
+				cell.setCellValue(value.toString());
+
+		}
+	}
+}