Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Minor fix for tika validation. #96

Open
wants to merge 1 commit into
base: MOODLE_310_STABLE
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions classes/enrich/text/tika.php
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ private function tika_server_ready() {
$url = '';
// Check if we have a valid set of config.
if (! empty($this->config->tikahostname) && ! empty($this->config->tikaport)) {
$port = $this->config->port;
$hostname = rtrim($this->config->hostname, "/");
$port = $this->config->tikaport;
$hostname = rtrim($this->config->tikahostname, "/");
$url = $hostname . ':' . $port;
}

Expand Down Expand Up @@ -138,8 +138,8 @@ public function can_analyze($file) {
/**
* Use tika to extract text from file.
*
* @param file $file
* @param esrequest\client $client client
* @param \stored_file $file
* @param \search_elastic\esrequest $client client
* @return string|boolean
*/
public function extract_text($file, $client) {
Expand Down Expand Up @@ -203,4 +203,3 @@ public static function form_definition_extra($form, $mform, $customdata, $config
}

}

116 changes: 116 additions & 0 deletions cli/tika_config_tester.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
<?php
// This file is part of Moodle - http://moodle.org/
//
// Moodle is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Moodle is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Moodle. If not, see <http://www.gnu.org/licenses/>.

/**
* CLI config tester
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you considered moving this check from the CLI to the UI instead? E.g. after the settings are submitted, we could check whether the configuration actually works and then display the results.

*
* @package search_elastic
* @copyright 2023 David Castro <[email protected]>
* @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
*/
// Mark this entry point as a command-line script before config.php is loaded.
define('CLI_SCRIPT', true);

require(__DIR__ . '/../../../../config.php');
require_once($CFG->libdir . '/clilib.php'); // Cli only functions.

// Supported long options with their default values.
$longoptions = [
    'help' => false,
    'testfileid' => '',
];
// Short aliases mapped to the long option they stand for.
$shortaliases = [
    'h' => 'help',
    't' => 'testfileid',
];
[$options, $unrecognized] = cli_get_params($longoptions, $shortaliases);

// Abort when arguments outside the declared options were passed.
if (!empty($unrecognized)) {
    cli_error(get_string('cliunknowoption', 'admin', implode("\n ", $unrecognized)));
}

// Print the usage text and stop without running any diagnostics when help was requested.
// NOTE(review): the lines from "Copy link" down to "tests/fixtures?" inside the help string
// look like review-page residue rather than intended help output - confirm and remove.
if ($options['help']) {
$help = "
Run Tika diagnostics.

Options:
-h, --help Print out this help
-t, --testfileid (Optional) PDF or accepted file id to send to tika for analysis
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where do we get testfileid from? Is it id from mdl_files? If so, then this is not user friendly. Maybe we can pass a path to a test file instead? Or we could use files from tests/fixtures?


Examples:
\$ sudo -u www-data /usr/bin/php search/engine/elastic/cli/tika_config_tester.php -t=<file id>
";

echo $help;
die;
}

/**
 * Check whether the configured Tika server answers a GET request with HTTP 200.
 *
 * Inspired by \search_elastic\enrich\text\tika::tika_server_ready.
 * Outputs cli messages on error.
 *
 * Reads the 'tikahostname' and 'tikaport' settings of the 'search_elastic'
 * plugin and probes "<hostname>:<port>" (trailing slashes stripped from the
 * hostname) through \search_elastic\esrequest.
 *
 * @return bool true when the server answered with HTTP 200; false when
 *              tikahostname or tikaport is not configured.
 */
function tika_server_ready() {
    $tikahostname = get_config('search_elastic', 'tikahostname');
    $tikaport = get_config('search_elastic', 'tikaport');

    // Without both settings there is no URL to probe.
    if (empty($tikahostname) || empty($tikaport)) {
        cli_writeln('tikahostname or tikaport are not set in elasticsearch config');
        return false;
    }

    $url = rtrim($tikahostname, "/") . ':' . $tikaport;

    // Check we can reach Tika server.
    $client = new \search_elastic\esrequest();
    $response = $client->get($url);

    // NOTE(review): on transport-level failures the client may hand back something
    // that is not a response object (an exception, or nothing at all) - confirm
    // against esrequest. Guard before calling any response method so the failure
    // is reported as a CLI error instead of a fatal "call to undefined method".
    if (!is_object($response) || !method_exists($response, 'getStatusCode')) {
        $detail = 'Undetermined';
        if ($response instanceof \Throwable) {
            $detail = $response->getMessage();
        }
        cli_error("Making a GET request to $url resulted in error:\nHTTP Code: Undetermined\nResponse: $detail");
        return false;
    }

    $responsecode = $response->getStatusCode();
    if ($responsecode == 200) {
        return true;
    }

    $error = 'Undetermined';
    if (method_exists($response, 'getBody')) {
        // Interpolated into the message below, so the body is converted to a string there.
        $error = $response->getBody();
    }
    cli_error("Making a GET request to $url resulted in error:\nHTTP Code: $responsecode\nResponse: $error");

    // Fallback in case cli_error() returns - presumably it terminates the script.
    return false;
}

// Abort when the Tika readiness check fails (missing settings or unreachable server).
$canusetika = tika_server_ready();
if (!$canusetika) {
    cli_error("Tika cannot be used. Please verify plugin configuration.");
}
cli_writeln('Connection to tika was successful!');

// The file analysis step is optional and only runs when a file id was passed.
$fileid = $options['testfileid'];
if (empty($fileid)) {
    cli_writeln('No file id specified, exiting.');
    exit(0);
}

$tika = new \search_elastic\enrich\text\tika(get_config('search_elastic'));
$fs = get_file_storage();
$file = $fs->get_file_by_id($fileid);
if (!$file) {
    // No stored file matches the given id; report it instead of calling methods on a non-object.
    cli_error("No file found with id $fileid. Please pass the id of an existing stored file.");
}
$text = $tika->analyze_file($file);

cli_writeln('Text found in file ' . $file->get_filename() . ': ' . $text);