diff --git a/packages/playground/data-liberation/plugin.php b/packages/playground/data-liberation/plugin.php index ecf88569fe..3a0f15f7a4 100644 --- a/packages/playground/data-liberation/plugin.php +++ b/packages/playground/data-liberation/plugin.php @@ -8,13 +8,13 @@ /** * Don't run KSES on the attribute values during the import. - * + * * Without this filter, WP_HTML_Tag_Processor::set_attribute() will * assume the value is a URL and run KSES on it, which will incorrectly * prefix relative paths with http://. - * + * * For example: - * + * * > $html = new WP_HTML_Tag_Processor( '' ); * > $html->next_tag(); * > $html->set_attribute( 'src', './_assets/log-errors.png' ); @@ -25,6 +25,41 @@ return []; }); +/** + * Development debug code to run the import manually. + * @TODO: Remove this in favor of a CLI command. + */ +add_action('init', function() { + return; + $wxr_path = __DIR__ . '/tests/fixtures/wxr-simple.xml'; + $importer = WP_Stream_Importer::create_for_wxr_file( + $wxr_path + ); + while($importer->next_step()) { + // ... + } + return; + $importer->next_step(); + $paused_importer_state = $importer->get_reentrancy_cursor(); + + echo "\n\n"; + echo "moving to importer2\n"; + echo "\n\n"; + + $importer2 = WP_Stream_Importer::create_for_wxr_file( + $wxr_path, + array(), + $paused_importer_state + ); + $importer2->next_step(); + $importer2->next_step(); + $importer2->next_step(); + // $importer2->next_step(); + // var_dump($importer2); + + die("YAY"); +}); + // Register admin menu add_action('admin_menu', function() { add_menu_page( @@ -86,7 +121,7 @@ function data_liberation_admin_page() { data_liberation_process_import(); echo ''; } - + ?>
Import Type | @@ -175,7 +210,7 @@ function data_liberation_admin_page() {|
---|---|
WXR File | @@ -184,7 +219,7 @@ function data_liberation_admin_page() {|
WXR URL | @@ -193,7 +228,7 @@ function data_liberation_admin_page() {|
Markdown ZIP | @@ -210,7 +245,7 @@ function data_liberation_admin_page() {
Date | @@ -329,7 +364,7 @@ function data_liberation_admin_page() { */ // if(is_wp_error(wp_schedule_event(time(), 'data_liberation_minute', 'data_liberation_process_import'))) { // wp_delete_attachment($attachment_id, true); - // // @TODO: More user friendly error message – maybe redirect back to the import screen and + // // @TODO: More user friendly error message – maybe redirect back to the import screen and // // show the error there. // wp_die('Failed to schedule import – the "data_liberation_minute" schedule may not be registered.'); // } @@ -353,20 +388,9 @@ function data_liberation_process_import() { function data_liberation_import_step($import) { $importer = data_liberation_create_importer($import); - // @TODO: Save the last importer state so we can resume it later if interrupted. - update_option('data_liberation_import_progress', [ - 'status' => 'Downloading static assets...', - 'current' => 0, - 'total' => 0 - ]); - $importer->frontload_assets(); - // @TODO: Keep track of multiple progress dimensions – posts, assets, categories, etc. - update_option('data_liberation_import_progress', [ - 'status' => 'Importing posts...', - 'current' => 0, - 'total' => 0 - ]); - $importer->import_entities(); + while($importer->next_step()) { + // ...Twiddle our thumbs... + } delete_option('data_liberation_active_import'); // @TODO: Do not echo things. Append to an import log where we can retrace the steps. // Also, store specific import events in the database so the user can react and @@ -382,25 +406,13 @@ function data_liberation_create_importer($import) { // @TODO: Save the error, report it to the user. return; } - $entity_iterator_factory = function() use ($wxr_path) { - $wxr = new WP_WXR_Reader(); - $wxr->connect_upstream(new WP_File_Reader($wxr_path)); - - return $wxr; - }; - return WP_Stream_Importer::create( - $entity_iterator_factory + return WP_Stream_Importer::create_for_wxr_file( + $wxr_path ); case 'wxr_url': - $wxr_url = $import['wxr_url']; - $entity_iterator_factory = function() use ($wxr_url) { - $wxr = new WP_WXR_Reader(); - $wxr->connect_upstream(new WP_Remote_File_Reader($wxr_url)); - return $wxr; - }; - return WP_Stream_Importer::create( - $entity_iterator_factory + return WP_Stream_Importer::create_for_wxr_url( + $import['wxr_url'] ); case 'markdown_zip': @@ -419,18 +431,12 @@ function data_liberation_create_importer($import) { } } $markdown_root = $temp_dir; - $entity_iterator_factory = function() use ($markdown_root) { - return new WP_Markdown_Directory_Tree_Reader( - $markdown_root, - 1000 - ); - }; - return WP_Markdown_Importer::create( - $entity_iterator_factory, [ + return WP_Markdown_Importer::create_for_markdown_directory( + $markdown_root, [ 'source_site_url' => 'file://' . $markdown_root, 'local_markdown_assets_root' => $markdown_root, 'local_markdown_assets_url_prefix' => '@site/', ] ); } -} \ No newline at end of file +} diff --git a/packages/playground/data-liberation/src/byte-readers/WP_Byte_Reader.php b/packages/playground/data-liberation/src/byte-readers/WP_Byte_Reader.php index efc80bffd9..6d8bd247db 100644 --- a/packages/playground/data-liberation/src/byte-readers/WP_Byte_Reader.php +++ b/packages/playground/data-liberation/src/byte-readers/WP_Byte_Reader.php @@ -1,8 +1,8 @@ chunk_size = $chunk_size; } - /** - * Really these are just `tell()` and `seek()` operations, only the state is more - * involved than a simple offset. Hmm. - */ - public function pause(): array|bool { - return array( - 'offset_in_file' => $this->offset_in_file, - ); + public function tell(): int { + // Save the previous offset, not the current one. + // This way, after resuming, the next read will yield the same $output_bytes + // as we have now. + return $this->offset_in_file - $this->last_chunk_size; } - public function resume( $paused_state ): bool { + public function seek( $offset_in_file ): bool { + if ( ! is_int( $offset_in_file ) ) { + _doing_it_wrong( __METHOD__, 'Cannot set a file reader cursor to a non-integer offset.', '1.0.0' ); + return false; + } if ( $this->file_pointer ) { - _doing_it_wrong( __METHOD__, 'Cannot resume a file reader that is already initialized.', '1.0.0' ); + _doing_it_wrong( __METHOD__, 'Cannot set a file reader cursor on a file reader that is already initialized.', '1.0.0' ); return false; } - $this->offset_in_file = $paused_state['offset_in_file']; + $this->offset_in_file = $offset_in_file; + $this->last_chunk_size = 0; return true; } @@ -50,7 +53,8 @@ public function get_last_error(): string|null { } public function next_bytes(): bool { - $this->output_bytes = ''; + $this->output_bytes = ''; + $this->last_chunk_size = 0; if ( $this->last_error || $this->is_finished() ) { return false; } @@ -66,7 +70,8 @@ public function next_bytes(): bool { $this->state = static::STATE_FINISHED; return false; } - $this->offset_in_file += strlen( $bytes ); + $this->last_chunk_size = strlen( $bytes ); + $this->offset_in_file += $this->last_chunk_size; $this->output_bytes .= $bytes; return true; } diff --git a/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Reader.php b/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Reader.php index e9e19ba0b1..819480d8fe 100644 --- a/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Reader.php +++ b/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Reader.php @@ -22,6 +22,19 @@ public function __construct( $url ) { $this->url = $url; } + public function tell(): int { + return $this->bytes_already_read + $this->skip_bytes; + } + + public function seek( $offset_in_file ): bool { + if ( $this->request ) { + _doing_it_wrong( __METHOD__, 'Cannot set a remote file reader cursor on a remote file reader that is already initialized.', '1.0.0' ); + return false; + } + $this->skip_bytes = $offset_in_file; + return true; + } + public function next_bytes(): bool { if ( null === $this->request ) { $this->request = new WordPress\AsyncHttp\Request( @@ -90,21 +103,6 @@ public function get_bytes(): string|null { return $this->current_chunk; } - public function pause(): array|bool { - return array( - 'offset_in_file' => $this->bytes_already_read + $this->skip_bytes, - ); - } - - public function resume( $paused_state ): bool { - if ( $this->request ) { - _doing_it_wrong( __METHOD__, 'Cannot resume a remote file reader that is already initialized.', '1.0.0' ); - return false; - } - $this->skip_bytes = $paused_state['offset_in_file']; - return true; - } - public function is_finished(): bool { return $this->is_finished; } diff --git a/packages/playground/data-liberation/src/import/WP_Attachment_Downloader.php b/packages/playground/data-liberation/src/import/WP_Attachment_Downloader.php index a54ce96eda..a6be3e74f0 100644 --- a/packages/playground/data-liberation/src/import/WP_Attachment_Downloader.php +++ b/packages/playground/data-liberation/src/import/WP_Attachment_Downloader.php @@ -7,23 +7,25 @@ class WP_Attachment_Downloader { private $client; private $fps = array(); private $output_root; - private $partial_files = array(); - private $output_paths = array(); + private $output_paths = array(); + + private $current_event; + private $pending_events = array(); + private $downloads_so_far = 0; + private $enqueued_resource_id; public function __construct( $output_root ) { $this->client = new Client(); $this->output_root = $output_root; } - public function enqueue_if_not_exists( $url, $output_path = null ) { - if ( null === $output_path ) { - // Use the path from the URL. - $parsed_url = parse_url( $url ); - if ( false === $parsed_url ) { - return false; - } - $output_path = $parsed_url['path']; - } + public function has_pending_requests() { + return count( $this->client->get_active_requests() ) > 0; + } + + public function enqueue_if_not_exists( $url, $output_path ) { + $this->enqueued_resource_id = null; + $output_path = $this->output_root . '/' . ltrim( $output_path, '/' ); if ( file_exists( $output_path ) ) { // @TODO: Reconsider the return value. The enqueuing operation failed, @@ -42,18 +44,28 @@ public function enqueue_if_not_exists( $url, $output_path = null ) { return false; } + ++$this->downloads_so_far; switch ( $protocol ) { case 'file': $local_path = parse_url( $url, PHP_URL_PATH ); if ( false === $local_path ) { return false; } + // Just copy the file over. // @TODO: think through the chmod of the created file. - return copy( $local_path, $output_path ); + + $this->enqueued_resource_id = 'file:' . $this->downloads_so_far; + $success = copy( $local_path, $output_path ); + $this->pending_events[] = new WP_Attachment_Downloader_Event( + $this->enqueued_resource_id, + $success ? WP_Attachment_Downloader_Event::SUCCESS : WP_Attachment_Downloader_Event::FAILURE + ); + return true; case 'http': case 'https': $request = new Request( $url ); + $this->enqueued_resource_id = 'http:' . $request->id; $this->output_paths[ $request->id ] = $output_path; $this->client->enqueue( $request ); return true; @@ -61,10 +73,28 @@ public function enqueue_if_not_exists( $url, $output_path = null ) { return false; } + public function get_enqueued_resource_id() { + return $this->enqueued_resource_id; + } + public function queue_full() { return count( $this->client->get_active_requests() ) >= 10; } + public function get_event() { + return $this->current_event; + } + + public function next_event() { + $this->current_event = null; + if ( count( $this->pending_events ) === 0 ) { + return false; + } + + $this->current_event = array_shift( $this->pending_events ); + return true; + } + public function poll() { if ( ! $this->client->await_next_event() ) { return false; @@ -75,54 +105,63 @@ public function poll() { // Let's keep referring to the original request. $original_request_id = $request->original_request()->id; - switch ( $event ) { - case Client::EVENT_GOT_HEADERS: - if ( ! $request->is_redirected() ) { - $this->partial_files[ $original_request_id ] = $this->output_paths[ $original_request_id ] . '.partial'; - if ( file_exists( $this->partial_files[ $original_request_id ] ) ) { - unlink( $this->partial_files[ $original_request_id ] ); + while ( true ) { + switch ( $event ) { + case Client::EVENT_GOT_HEADERS: + if ( ! $request->is_redirected() ) { + if ( file_exists( $this->output_paths[ $original_request_id ] . '.partial' ) ) { + unlink( $this->output_paths[ $original_request_id ] . '.partial' ); + } + $this->fps[ $original_request_id ] = fopen( $this->output_paths[ $original_request_id ] . '.partial', 'wb' ); + if ( false === $this->fps[ $original_request_id ] ) { + // @TODO: Log an error. + } } - $this->fps[ $original_request_id ] = fopen( $this->output_paths[ $original_request_id ] . '.partial', 'wb' ); - if ( false === $this->fps[ $original_request_id ] ) { + break; + case Client::EVENT_BODY_CHUNK_AVAILABLE: + $chunk = $this->client->get_response_body_chunk(); + if ( false === fwrite( $this->fps[ $original_request_id ], $chunk ) ) { // @TODO: Log an error. } - } - break; - case Client::EVENT_BODY_CHUNK_AVAILABLE: - $chunk = $this->client->get_response_body_chunk(); - if ( false === fwrite( $this->fps[ $original_request_id ], $chunk ) ) { - // @TODO: Log an error. - } - break; - case Client::EVENT_FAILED: - if ( isset( $this->fps[ $original_request_id ] ) ) { - fclose( $this->fps[ $original_request_id ] ); - } - if ( isset( $this->partial_files[ $original_request_id ] ) ) { - $partial_file = $this->output_root . '/' . $this->partial_files[ $original_request_id ] . '.partial'; - if ( file_exists( $partial_file ) ) { - unlink( $partial_file ); - } - } - unset( $this->output_paths[ $original_request_id ] ); - break; - case Client::EVENT_FINISHED: - if ( ! $request->is_redirected() ) { - // Only clean up if this was the last request in the chain. + break; + case Client::EVENT_FAILED: if ( isset( $this->fps[ $original_request_id ] ) ) { fclose( $this->fps[ $original_request_id ] ); } - if ( isset( $this->output_paths[ $original_request_id ] ) && isset( $this->partial_files[ $original_request_id ] ) ) { - if ( false === rename( - $this->partial_files[ $original_request_id ], - $this->output_paths[ $original_request_id ] - ) ) { - // @TODO: Log an error. + if ( isset( $this->output_paths[ $original_request_id ] ) ) { + $partial_file = $this->output_root . '/' . $this->output_paths[ $original_request_id ] . '.partial'; + if ( file_exists( $partial_file ) ) { + unlink( $partial_file ); } } - unset( $this->partial_files[ $original_request_id ] ); - } - break; + $this->pending_events[] = new WP_Attachment_Downloader_Event( + 'http:' . $original_request_id, + WP_Attachment_Downloader_Event::FAILURE + ); + unset( $this->output_paths[ $original_request_id ] ); + break; + case Client::EVENT_FINISHED: + if ( ! $request->is_redirected() ) { + // Only clean up if this was the last request in the chain. + if ( isset( $this->fps[ $original_request_id ] ) ) { + fclose( $this->fps[ $original_request_id ] ); + } + if ( isset( $this->output_paths[ $original_request_id ] ) ) { + if ( false === rename( + $this->output_paths[ $original_request_id ] . '.partial', + $this->output_paths[ $original_request_id ] + ) ) { + // @TODO: Log an error. + } + } + $this->pending_events[] = new WP_Attachment_Downloader_Event( + 'http:' . $original_request_id, + WP_Attachment_Downloader_Event::SUCCESS + ); + unset( $this->output_paths[ $original_request_id ] ); + } + break; + } } return true; diff --git a/packages/playground/data-liberation/src/import/WP_Attachment_Downloader_Event.php b/packages/playground/data-liberation/src/import/WP_Attachment_Downloader_Event.php new file mode 100644 index 0000000000..b759ebb8fb --- /dev/null +++ b/packages/playground/data-liberation/src/import/WP_Attachment_Downloader_Event.php @@ -0,0 +1,15 @@ +resource_id = $resource_id; + $this->type = $type; + } +} diff --git a/packages/playground/data-liberation/src/import/WP_Markdown_Importer.php b/packages/playground/data-liberation/src/import/WP_Markdown_Importer.php index f4551a35a0..e76e6d643f 100644 --- a/packages/playground/data-liberation/src/import/WP_Markdown_Importer.php +++ b/packages/playground/data-liberation/src/import/WP_Markdown_Importer.php @@ -2,12 +2,14 @@ class WP_Markdown_Importer extends WP_Stream_Importer { - public static function create( - $entity_iterator_factory, - $options = array() - ) { - $options = static::parse_options( $options ); - return new WP_Markdown_Importer( $entity_iterator_factory, $options ); + public static function create_for_markdown_directory( $markdown_directory, $options = array(), $cursor = null ) { + return static::create( + function ( $cursor = null ) use ( $markdown_directory ) { + return WP_Markdown_Importer::create( $markdown_directory, $cursor ); + }, + $options, + $cursor + ); } protected static function parse_options( $options ) { diff --git a/packages/playground/data-liberation/src/import/WP_Stream_Importer.php b/packages/playground/data-liberation/src/import/WP_Stream_Importer.php index 5258fb9cae..cc4ea76c30 100644 --- a/packages/playground/data-liberation/src/import/WP_Stream_Importer.php +++ b/packages/playground/data-liberation/src/import/WP_Stream_Importer.php @@ -1,4 +1,8 @@ tag. */ - protected $source_site_url; + private $source_site_url; private $entity_iterator_factory; /** * @param array|string|null $query { @@ -38,18 +43,90 @@ class WP_Stream_Importer { * after the import. E.g. http://127.0.0.1:9400/wp-content/uploads/ * } */ - protected $options; - protected $downloader; + private $options; + + const STAGE_INITIAL = '#initial'; + const STAGE_TOPOLOGICAL_SORT = '#topological_sort'; + const STAGE_FRONTLOAD_ASSETS = '#frontload_assets'; + const STAGE_IMPORT_ENTITIES = '#import_entities'; + const STAGE_FINISHED = '#finished'; + + /** + * The current state of the import process. + * @var string + */ + private $stage = self::STAGE_INITIAL; + + /** + * Iterator that streams entities to import. + */ + private $entity_iterator; + private $resume_at_entity; + /** + * A map of currently downloaded resources for each entity in + * the following format: + * + * [$entity_cursor => [$resource_id => true]] + * + * @var array
---|