From 8a595fa4504be061d3b8b370c71b89e03a720194 Mon Sep 17 00:00:00 2001 From: David Castro Date: Fri, 17 Mar 2023 18:41:03 -0500 Subject: [PATCH] Minor fix for tika validation. Adding a CLI script for testing tika too. --- classes/enrich/text/tika.php | 9 ++- cli/tika_config_tester.php | 116 +++++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+), 5 deletions(-) create mode 100644 cli/tika_config_tester.php diff --git a/classes/enrich/text/tika.php b/classes/enrich/text/tika.php index b48c8f6..c32152d 100644 --- a/classes/enrich/text/tika.php +++ b/classes/enrich/text/tika.php @@ -100,8 +100,8 @@ private function tika_server_ready() { $url = ''; // Check if we have a valid set of config. if (! empty($this->config->tikahostname) && ! empty($this->config->tikaport)) { - $port = $this->config->port; - $hostname = rtrim($this->config->hostname, "/"); + $port = $this->config->tikaport; + $hostname = rtrim($this->config->tikahostname, "/"); $url = $hostname . ':' . $port; } @@ -138,8 +138,8 @@ public function can_analyze($file) { /** * Use tika to extract text from file. * - * @param file $file - * @param esrequest\client $client client + * @param \stored_file $file + * @param \search_elastic\esrequest $client client * @return string|boolean */ public function extract_text($file, $client) { @@ -203,4 +203,3 @@ public static function form_definition_extra($form, $mform, $customdata, $config } } - diff --git a/cli/tika_config_tester.php b/cli/tika_config_tester.php new file mode 100644 index 0000000..216bc26 --- /dev/null +++ b/cli/tika_config_tester.php @@ -0,0 +1,116 @@ +. + +/** + * CLI config tester + * + * @package search + * @copyright 2023 David Castro + * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later + */ +define('CLI_SCRIPT', true); + +require(__DIR__.'/../../../../config.php'); +require_once($CFG->libdir.'/clilib.php'); // Cli only functions. 
+
+// Parse the command line: --help/-h prints usage, --testfileid/-t names a stored file to analyse.
+list($options, $unrecognized) = cli_get_params([
+    'help' => false,
+    'testfileid' => '',
+], [
+    'h' => 'help',
+    't' => 'testfileid',
+]);
+
+if ($unrecognized) {
+    $unrecognized = implode("\n  ", $unrecognized);
+    cli_error(get_string('cliunknowoption', 'admin', $unrecognized));
+}
+
+if ($options['help']) {
+    $help = "
+Run Tika diagnostics.
+
+Options:
+-h, --help            Print out this help
+-t, --testfileid      (Optional) PDF or accepted file id to send to tika for analysis
+
+Examples:
+\$ sudo -u www-data /usr/bin/php search/engine/elastic/cli/tika_config_tester.php -t=<fileid>
+";
+
+    echo $help;
+    exit(0);
+}
+
+/**
+ * Checks that the configured Tika server answers a GET request with HTTP 200.
+ *
+ * Inspired by \search_elastic\enrich\text\tika::tika_server_ready.
+ * Reads 'tikahostname' and 'tikaport' from the search_elastic plugin config and
+ * outputs cli messages on error.
+ *
+ * @return bool true when the server replied with HTTP 200; false when the
+ *              hostname or port is not configured. Any other outcome is reported
+ *              through cli_error().
+ */
+function tika_server_ready() {
+    $tikahostname = get_config('search_elastic', 'tikahostname');
+    $tikaport = get_config('search_elastic', 'tikaport');
+
+    // Without both settings there is no URL to probe.
+    if (empty($tikahostname) || empty($tikaport)) {
+        cli_writeln('tikahostname or tikaport are not set in elasticsearch config');
+        return false;
+    }
+
+    $url = rtrim($tikahostname, "/") . ':' . $tikaport;
+    $client = new \search_elastic\esrequest();
+    $response = $client->get($url);
+
+    // The value handed back might be a guzzle exception instead of a response,
+    // so every accessor is probed before it is called.
+    $isobject = is_object($response);
+    $responsecode = null;
+    if ($isobject && method_exists($response, 'getStatusCode')) {
+        $responsecode = $response->getStatusCode();
+    }
+
+    if ((int) $responsecode === 200) {
+        return true;
+    }
+
+    $error = 'Undetermined';
+    if ($isobject && method_exists($response, 'getBody')) {
+        $error = $response->getBody();
+    } else if ($response instanceof \Throwable) {
+        // Not a response at all: report the exception message instead of a body.
+        $error = $response->getMessage();
+    }
+    $codetext = ($responsecode === null) ? 'Undetermined' : $responsecode;
+    cli_error("Making a GET request to $url resulted in error:\nHTTP Code: $codetext\nResponse: $error");
+
+    // Kept so the function has an explicit result on every path.
+    return false;
+}
+
+$canusetika = tika_server_ready();
+if (!$canusetika) {
+    cli_error("Tika cannot be used. Please verify plugin configuration.");
+}
+cli_writeln('Connection to tika was successful!');
+
+$fileid = $options['testfileid'];
+if (empty($fileid)) {
+    cli_writeln('No file id specified, exiting.');
+    exit(0);
+}
+
+// Anything that is not a string of digits (for instance a flag given without a
+// value) cannot identify a stored file, so stop before querying file storage.
+if (!is_string($fileid) || !ctype_digit($fileid)) {
+    cli_error('The test file id must be a positive integer.');
+}
+
+$fs = get_file_storage();
+$file = $fs->get_file_by_id($fileid);
+if (!$file) {
+    // NOTE(review): get_file_by_id() is expected to return false for an unknown id - confirm.
+    cli_error("No stored file found with id $fileid.");
+}
+
+$tika = new \search_elastic\enrich\text\tika(get_config('search_elastic'));
+$text = $tika->analyze_file($file);
+
+cli_writeln('Text found in file ' . $file->get_filename() . ': ' . $text);