From de519654fc98b69fd9ff3741b82970f1faedc72a Mon Sep 17 00:00:00 2001 From: neohipper Date: Mon, 29 Jan 2018 16:30:49 +0100 Subject: [PATCH 01/12] Always update last_modified --- includes/batch.inc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/includes/batch.inc b/includes/batch.inc index b2b2d9b..7df1053 100644 --- a/includes/batch.inc +++ b/includes/batch.inc @@ -108,7 +108,7 @@ function islandora_xmlsitemap_batch_operation($jump, $cutoff, &$context) { '@total' => $sandbox['total'], )); $context['finished'] = $sandbox['offset'] / $sandbox['total']; - if ($context['finished'] >= 1) { - variable_set('islandora_xmlsitemap_last_modified_value', $sandbox['last_modified']); - } + + // Always update last_modified + variable_set('islandora_xmlsitemap_last_modified_value', $sandbox['last_modified']); } From 036584f18cba24b5935ccbdecc808df4acc7a43a Mon Sep 17 00:00:00 2001 From: Jan Jouke Tjalsma Date: Mon, 5 Feb 2018 12:44:34 +0100 Subject: [PATCH 02/12] Finish rewrite --- README.md | 12 +- includes/admin.form.inc | 25 ++- includes/batch.inc | 293 +++++++++++++++++++++++++-------- islandora_xmlsitemap.drush.inc | 37 +++++ islandora_xmlsitemap.module | 6 +- 5 files changed, 296 insertions(+), 77 deletions(-) create mode 100644 islandora_xmlsitemap.drush.inc diff --git a/README.md b/README.md index cf8b0b5..24ef193 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Install as usual, see [this](https://drupal.org/documentation/install/modules-th Set 'Last Modified Solr Field' and 'Maximum number of Islandora links to process at once' in Administration » Islandora » XML Sitemap Integration (admin/islandora/xmlsitemap). -![Configuration](https://camo.githubusercontent.com/407972e0a2c14bafd74924992c659021b800abb0/687474703a2f2f692e696d6775722e636f6d2f455a534f4b68372e706e67) +![Configuration](https://user-images.githubusercontent.com/2461961/35802085-ffafc758-0a6e-11e8-8a0c-e1f09e4d2a45.png) ### Notes @@ -48,6 +48,16 @@ Please also note that objects marked as "inactive", whether manually or by using Larger sites with greater than 100,000 objects may encounter issues during the sitemap building process with the default configuration, such as the process hanging around a specific number indefinitely or exiting the process entirely before completion. These users may want to try unchecking the "Prefetch URL aliases during sitemap generation" option found on the xmlsitemap admin configuration page (/admin/config/search/xmlsitemap/settings) and trying the process again. +## Bulk generation using drush +There is a Drush command available for the generation of sitemap links. This command allows you to optionally fetch a limited amount of objects similar to the hook_cron() and lets you define a custom amount to be fetched at once from SOLR. + +Command: +`drush islandora_xmlsitemap_generate [max_chunk_size] [limit] [--regenerate]` + +* `Max chunk size` defaults to 100 +* If no `limit` is set all objects will be processed +* The `--regenerate` flag removes the "last_modified" value so will cause processing to start at the beginning + ## Documentation This module's documentation is also available at [our wiki](https://wiki.duraspace.org/display/ISLANDORA/Islandora+XML+Sitemap). diff --git a/includes/admin.form.inc b/includes/admin.form.inc index 4fe19d3..7e2560e 100644 --- a/includes/admin.form.inc +++ b/includes/admin.form.inc @@ -34,13 +34,22 @@ function islandora_xmlsitemap_admin_form($form, &$form_state) { '#description' => t('Solr field in which we can perform sorting and range queries.'), '#default_value' => variable_get('islandora_xmlsitemap_last_modified_field', 'fgs_lastModifiedDate_dt'), ); - $form['islandora_xmlsitemap_number_of_pids_to_process'] = array( + $form['islandora_xmlsitemap_generate_limit_cron'] = array( '#type' => 'textfield', - '#title' => 'Maximum Number of Islandora links to process at once', + '#title' => 'Maximum Number of Islandora links to process during hook_cron()', '#size' => 10, '#element_validate' => array('element_validate_integer_positive'), - '#default_value' => variable_get('islandora_xmlsitemap_number_of_pids_to_process', 1000), - '#description' => 'This is the number of Islandora/Fedora links we will process at once', + '#default_value' => variable_get('islandora_xmlsitemap_generate_limit_cron', 1000), + '#description' => 'This is the maximum number of Islandora/Fedora links we will automatically process when hook_cron() is called', + ); + + $form['islandora_xmlsitemap_generate_chunk_size'] = array( + '#type' => 'textfield', + '#title' => 'Amount of Islandora links to fetch and process at once', + '#size' => 10, + '#element_validate' => array('element_validate_integer_positive'), + '#default_value' => variable_get('islandora_xmlsitemap_generate_chunk_size', 100), + '#description' => 'This is the number of Islandora/Fedora links we will fetch and process at once (before updating the Last Modified value) when using the buttons below or when hook_cron() is called', ); $form['actions'] = array( @@ -74,7 +83,8 @@ function islandora_xmlsitemap_admin_form_submit(&$form, &$form_state) { $to_set = array( 'islandora_xmlsitemap_last_modified_field', - 'islandora_xmlsitemap_number_of_pids_to_process', + 'islandora_xmlsitemap_generate_limit_cron', + 'islandora_xmlsitemap_generate_chunk_size' ); if ($button == 'generate' || $button == 'regenerate') { if ($button == 'regenerate') { @@ -82,7 +92,10 @@ function islandora_xmlsitemap_admin_form_submit(&$form, &$form_state) { } module_load_include('inc', 'islandora_xmlsitemap', 'includes/batch'); - $batch = islandora_xmlsitemap_get_batch(100, -1); + $batch = islandora_xmlsitemap_get_batch( + variable_get('islandora_xmlsitemap_generate_chunk_size', 100), + NULL + ); batch_set($batch); } elseif ($button == 'submit') { diff --git a/includes/batch.inc b/includes/batch.inc index 7df1053..c16d6ea 100644 --- a/includes/batch.inc +++ b/includes/batch.inc @@ -1,5 +1,4 @@ t('Time elapsed: @elapsed
Estimated time remaining @estimate.'), 'operations' => array( - array('islandora_xmlsitemap_batch_operation', array($jump, $cutoff)), + // Using a single operation that will be called multiple times + array('islandora_xmlsitemap_batch_operation', array($max_chunk_size, $batch_limit)), ), 'file' => "$mod_path/includes/batch.inc", + 'finished' => 'islandora_xmlsitemap_finished' ); return $batch; } @@ -36,79 +36,234 @@ function islandora_xmlsitemap_get_batch($jump = 100, $cutoff = NULL) { /** * Batch operation. * - * Grab a number of records from Solr at a time, until we hit the cutoff - * number. + * Grab a number of records from Solr at a time + * + * This operation will be called multiple times until batch limit is reached or + * there are no records newer than the last_modified date that was defined + * + * Gets variables: + * - islandora_xmlsitemap_last_modified_value (starting point for query) + * - islandora_xmlsitemap_last_modified_field (field to look for + * last_modified_value to match against) + * - islandora_namespace_restriction_enforced + * + * Sets variable: + * - islandora_xmlsitemap_last_modified_value + * + * @param int $chunk_size + * The number of records to grab each iteration of the operation + * + * @param int $batch_limit + * The number of records to grab before ending the batch process * - * @param int $jump - * The number of records to grab each iteration. - * @param int $cutoff - * The number of records to process per batch. + * @param array &$context Reference to persistent context of the operation */ -function islandora_xmlsitemap_batch_operation($jump, $cutoff, &$context) { +function islandora_xmlsitemap_batch_operation( + $max_chunk_size = 100, + $batch_limit = null, + &$context +){ + // Create sandbox alias $sandbox =& $context['sandbox']; - if (!isset($sandbox['offset'])) { - $sandbox['offset'] = 0; - - $sandbox['last_modified'] = variable_get('islandora_xmlsitemap_last_modified_value', NULL); - - $date_field = variable_get('islandora_xmlsitemap_last_modified_field', 'fgs_lastModifiedDate_dt'); - - $qp = $sandbox['query_processor'] = new IslandoraSolrQueryProcessor(); - $qp->solrLimit = $cutoff > 0 ? - min($jump, $cutoff) : - $jump; - $qp->solrQuery = isset($sandbox['last_modified']) ? - "$date_field:[{$sandbox['last_modified']} TO *]" : - "$date_field:[* TO *]"; - $qp->solrParams['sort'] = "$date_field asc"; - $qp->solrParams['fq'] = array(); - $qp->solrParams['fl'] = 'PID'; - - $namespaces_enforced = variable_get('islandora_namespace_restriction_enforced', FALSE); - if ($namespaces_enforced) { - $namespace_map = function ($namespace) { - return 'PID:' . Apache_Solr_Service::escape("$namespace:") . '*'; - }; - module_load_include('inc', 'islandora', 'includes/utilities'); - $qp->solrParams['fq'][] = implode(' OR ', array_map($namespace_map, islandora_get_allowed_namespaces())); + /* + * Sandbox setup (only on first call of operation) + */ + if (empty($sandbox)) { + // Setup the sandbox + $sandbox = array(); + + // Prepare progress indicators + $sandbox['progress'] = 0;// The amount of rows processed + $sandbox['current_node'] = "None";// The PID of the latest processed row + $sandbox['total'] = null;// Represents total to be retrieved during this batch process. Set after initial results are retrieved + + // Set a batch limit if specified + if (isset($batch_limit)) { + $sandbox['batch_limit'] = (int) $batch_limit; } + + // Boostrap an IslandoraSolrQueryProcessor for our purposes + $last_modified = $sandbox['last_modified'] = variable_get('islandora_xmlsitemap_last_modified_value', NULL); + $last_modified_field = variable_get('islandora_xmlsitemap_last_modified_field', 'fgs_lastModifiedDate_dt'); + $enforce_namespaces = variable_get('islandora_namespace_restriction_enforced', FALSE); + $sandbox['queryProcessor'] = islandora_xmlsitemap_queryProcessor_create($last_modified_field, $last_modified, $enforce_namespaces); + + // Set maximum chunk size + $sandbox['max_chunk_size'] = (int) $max_chunk_size; + + // Set default SOLR limit + $sandbox['queryProcessor']->solrLimit = $sandbox['max_chunk_size']; } - else { - $sandbox['offset'] += $jump; - $qp = $sandbox['query_processor']; + + /* + * Iteration start + */ + + // Set SOLR offset to where we left off in the previous iteration + $sandbox['queryProcessor']->solrStart = $sandbox['progress']; + + // Set chunk size for limited batches + // This prevents overshooting the batch_limit + if (isset($sandbox['batch_limit'])) { + // Max chunk size for limited batches + $sandbox['max_chunk_size'] = min($sandbox['batch_limit'] - $sandbox['progress'], $sandbox['batch_limit']); + + // Update SOLR limit + $sandbox['queryProcessor']->solrLimit = $sandbox['max_chunk_size']; } - $qp->solrStart = $sandbox['offset']; + // Execute SOLR query and get result + $sandbox['queryProcessor']->executeQuery(FALSE, TRUE); + $result = $sandbox['queryProcessor']->islandoraSolrResult; + $resultCount = (int) $result['response']['numFound']; - // Query for $count PIDs, starting from $offset. - $qp->executeQuery(FALSE, TRUE); - $results = $qp->islandoraSolrResult; + // Catch empty result + if ($result === NULL || $resultCount === 0) { + // Display error message + $context['message'] = t('No results or SOLR error'); - $sandbox['total'] = $cutoff > 0 ? - min((int) $results['response']['numFound'], $cutoff) : - (int) $results['response']['numFound']; - if ($results === NULL || $sandbox['total'] === 0) { - $context['message'] = t('No results selected, or errored...'); - // Stash the most current value, so we can hopefully pick up where we left - // off last time. - variable_set('islandora_xmlsitemap_last_modified_value', $sandbox['last_modified']); + // Add error to the log + $context['results'][] = $context['message']; + + // Set finished to 1 so the batch process will not loop endlessly + $context['finished'] = 1; + + // Return control to batch engine return; } - // Add/update each result in the custom table. + + // Set total based on results (only once every batch) + if (!isset($sandbox['total'])) { + if (isset($sandbox['batch_limit'])) { + // Limited batch total is batch_limit or resultcount if there are less + // results than the batch_limit + $sandbox['total'] = min($resultCount, $sandbox['batch_limit']); + }else{ + // Unlimited batch total is equal to the number of results + $sandbox['total'] = $resultCount; + } + } + + // Process result rows module_load_include('inc', 'islandora_xmlsitemap', 'includes/utilities'); - foreach ($results['response']['objects'] as $result) { - islandora_xmlsitemap_add_or_update_link($result['PID'], $sandbox); + foreach ($result['response']['objects'] as $row) { + // Add or Update link + islandora_xmlsitemap_add_or_update_link($row['PID'], $sandbox); + + // Update our progress information. + $sandbox['progress']++; + $sandbox['current_node'] = $row['PID']; + + // Log result for post-processing in the 'finished' callback. + $context['results'][] = "Add/update ".$sandbox['current_node']; } - $qp->resetResults(); + //Drop results when finished with them + $sandbox['queryProcessor']->resetResults(); - $context['message'] = t('Processed @count of @total.', array( - '@count' => min($sandbox['offset'] + $jump, $sandbox['total']), + // Set message to be returned + $context['message'] = t('Processed @count of @total. Current node: @pid.', array( + '@count' => $sandbox['progress'], '@total' => $sandbox['total'], + '@pid' => $sandbox['current_node'] )); - $context['finished'] = $sandbox['offset'] / $sandbox['total']; - - // Always update last_modified - variable_set('islandora_xmlsitemap_last_modified_value', $sandbox['last_modified']); + + // Persist the last processed records last modified date + // This allows facilitates continuing from this point onward next time in case + // not all records have been processed due to limits set or a crash + if (isset($sandbox['last_modified'])) { + variable_set('islandora_xmlsitemap_last_modified_value', $sandbox['last_modified']); + } + + // Inform the batch engine of our progress, + // and provide an estimation of the completion level we reached. + $context['finished'] = $sandbox['progress'] / $sandbox['total']; +} + +/** + * Bootstraps a new queryprocessor for Islandora XMLsitemap + * + * @param string $last_modified_field + * The date field to use for sorting and filtering + * + * @param string $last_modified + * The last modified date + * + * @param boolean $enforce_namespaces + * Enforce Islandora namespaces or not. If enabled allowed namespaces will be + * retrieved using islandora_get_allowed_namespaces(). Defaults to FALSE + * + * @return IslandoraSolrQueryProcessor Bootstrapped SOLR query processor + */ +function islandora_xmlsitemap_queryProcessor_create($last_modified_field, $last_modified = NULL, $enforce_namespaces = FALSE) { + // Instantiate Islandora Query Processor + $qp = new IslandoraSolrQueryProcessor(); + + // Set sorting on the $last_modified_field field, ascending so we go from older to newer records + $qp->solrParams['sort'] = "$last_modified_field asc"; + + // Set empty filter query + $qp->solrParams['fq'] = array(); + + // Set or update the latest last_modified value + $qp->solrQuery = islandora_xmlsitemap_queryProcessor_last_modified_query( + $last_modified_field, + $last_modified + ); + + // Return only the PID field + $qp->solrParams['fl'] = 'PID'; + + // Enforce namespace if required by settings + if ($enforce_namespaces) { + $namespace_map = function ($namespace) { + return 'PID:' . Apache_Solr_Service::escape("$namespace:") . '*'; + }; + module_load_include('inc', 'islandora', 'includes/utilities'); + $qp->solrParams['fq'][] = implode(' OR ', array_map($namespace_map, islandora_get_allowed_namespaces())); + } + + return $qp; +} + +/** + * Creates a SOLR query string based on the last modified value + * + * @param string $last_modified_field Last modified date field + * @param string $last_modified Last modified date + * + * @return string SOLR query string + */ +function islandora_xmlsitemap_queryProcessor_last_modified_query($last_modified_field, $last_modified = NULL) { + if (isset($last_modified)) { + return "$last_modified_field:[{$last_modified} TO *]"; + } + return "$last_modified_field:[* TO *]"; +} + +/** + * Finished callback + * + * Displays result or errors when the batch finishes + */ +function islandora_xmlsitemap_finished($success, $results, $operations) { + if ($success) { + // Display result + drupal_set_message(t('@count results processed', array('@count' => count($results)))); + drupal_set_message(t('The last result was "%final"', array('%final' => end($results)))); + } + else { + // Display error + drupal_set_message( + t( + 'An error occurred while processing. The last result was: %final. Batch arguments: @args', + array( + '%final' => end($results), + '@args' => print_r($error_operation[0], TRUE), + ) + ), + 'error' + ); + } } diff --git a/islandora_xmlsitemap.drush.inc b/islandora_xmlsitemap.drush.inc new file mode 100644 index 0000000..8925b5d --- /dev/null +++ b/islandora_xmlsitemap.drush.inc @@ -0,0 +1,37 @@ + 'Generate XMLsitemap records.', + 'arguments' => array( + 'max_chunk_size' => 'The number of records to grab each SOLR request', + 'limit' => 'The total number of records to grab before ending the process' + ), + 'options' => array( + 'regenerate' => 'If set will clear the last_modified date before generating.' + ), + ); + return $items; +} + +/** + * implements drush_hook_COMMAND_validate() + */ +function drush_islandora_xmlsitemap_generate($max_chunk_size = 100, $limit = null) { + // Remove last_modified value if we need to regenerate + $regenerate = drush_get_option('regenerate', NULL); + if (isset($regenerate)) { + variable_del('islandora_xmlsitemap_last_modified_value'); + } + + // Start the process + module_load_include('inc', 'islandora_xmlsitemap', 'includes/batch'); + xmlsitemap_run_unprogressive_batch('islandora_xmlsitemap_get_batch', $max_chunk_size, $limit); +} diff --git a/islandora_xmlsitemap.module b/islandora_xmlsitemap.module index aac7c90..cf74a18 100644 --- a/islandora_xmlsitemap.module +++ b/islandora_xmlsitemap.module @@ -10,7 +10,11 @@ define('ISLANDORA_XMLSITEMAP_PATH_PREFIX', 'islandora/object/'); */ function islandora_xmlsitemap_cron() { module_load_include('inc', 'islandora_xmlsitemap', 'includes/batch'); - xmlsitemap_run_unprogressive_batch('islandora_xmlsitemap_get_batch'); + xmlsitemap_run_unprogressive_batch( + 'islandora_xmlsitemap_get_batch', + variable_get('islandora_xmlsitemap_generate_chunk_size', 100), + variable_get('islandora_xmlsitemap_generate_limit_cron', 1000) + ); } /** From e76e2e16a5076f5aca3431348cb664f51b3c59fc Mon Sep 17 00:00:00 2001 From: Jan Jouke Tjalsma Date: Thu, 8 Feb 2018 09:01:56 +0100 Subject: [PATCH 03/12] Better use of coding standards --- includes/batch.inc | 135 ++++++++++++++++----------------- islandora_xmlsitemap.drush.inc | 11 ++- 2 files changed, 72 insertions(+), 74 deletions(-) diff --git a/includes/batch.inc b/includes/batch.inc index c16d6ea..f8acaf4 100644 --- a/includes/batch.inc +++ b/includes/batch.inc @@ -8,18 +8,15 @@ * Get the our basic batch structure. * * @param int $chunk_size - * The number of records to grab each iteration of the operation + * The number of records to grab each iteration of the operation. * * @param int $batch_limit - * The number of records to grab before ending the batch process + * The number of records to grab before ending the batch process. * * @return array * An array representing the batch. */ -function islandora_xmlsitemap_get_batch( - $max_chunk_size = 100, - $batch_limit = NULL -){ +function islandora_xmlsitemap_get_batch($max_chunk_size = 100, $batch_limit = NULL){ $mod_path = drupal_get_path('module', 'islandora_xmlsitemap'); $batch = array( 'progress_message' => t('Time elapsed: @elapsed
Estimated time remaining @estimate.'), @@ -34,12 +31,10 @@ function islandora_xmlsitemap_get_batch( } /** - * Batch operation. - * - * Grab a number of records from Solr at a time + * Batch operation. Grab a number of records from Solr at a time. * * This operation will be called multiple times until batch limit is reached or - * there are no records newer than the last_modified date that was defined + * there are no records newer than the last_modified date that was defined. * * Gets variables: * - islandora_xmlsitemap_last_modified_value (starting point for query) @@ -51,105 +46,105 @@ function islandora_xmlsitemap_get_batch( * - islandora_xmlsitemap_last_modified_value * * @param int $chunk_size - * The number of records to grab each iteration of the operation + * The number of records to grab each iteration of the operation. * * @param int $batch_limit - * The number of records to grab before ending the batch process + * The number of records to grab before ending the batch process. * - * @param array &$context Reference to persistent context of the operation + * @param array &$context + * Reference to persistent context of the operation. */ -function islandora_xmlsitemap_batch_operation( - $max_chunk_size = 100, - $batch_limit = null, - &$context -){ +function islandora_xmlsitemap_batch_operation($max_chunk_size = 100, $batch_limit = null, &$context){ // Create sandbox alias $sandbox =& $context['sandbox']; /* - * Sandbox setup (only on first call of operation) + * Sandbox setup (only on first call of operation). */ if (empty($sandbox)) { - // Setup the sandbox $sandbox = array(); - // Prepare progress indicators - $sandbox['progress'] = 0;// The amount of rows processed - $sandbox['current_node'] = "None";// The PID of the latest processed row - $sandbox['total'] = null;// Represents total to be retrieved during this batch process. Set after initial results are retrieved + // The amount of rows processed. + $sandbox['progress'] = 0; + + // The PID of the latest processed row. + $sandbox['current_node'] = "None"; + + // Represents total to be retrieved during this batch process. Set after initial results are retrieved. + $sandbox['total'] = null; // Set a batch limit if specified if (isset($batch_limit)) { $sandbox['batch_limit'] = (int) $batch_limit; } - // Boostrap an IslandoraSolrQueryProcessor for our purposes + // Boostrap an IslandoraSolrQueryProcessor for our purposes. $last_modified = $sandbox['last_modified'] = variable_get('islandora_xmlsitemap_last_modified_value', NULL); $last_modified_field = variable_get('islandora_xmlsitemap_last_modified_field', 'fgs_lastModifiedDate_dt'); $enforce_namespaces = variable_get('islandora_namespace_restriction_enforced', FALSE); $sandbox['queryProcessor'] = islandora_xmlsitemap_queryProcessor_create($last_modified_field, $last_modified, $enforce_namespaces); - // Set maximum chunk size + // Set maximum chunk size. $sandbox['max_chunk_size'] = (int) $max_chunk_size; - // Set default SOLR limit + // Set default SOLR limit. $sandbox['queryProcessor']->solrLimit = $sandbox['max_chunk_size']; } /* - * Iteration start + * Iteration start. */ - // Set SOLR offset to where we left off in the previous iteration + // Set SOLR offset to where we left off in the previous iteration. $sandbox['queryProcessor']->solrStart = $sandbox['progress']; - // Set chunk size for limited batches - // This prevents overshooting the batch_limit + // Set chunk size for limited batches. This prevents overshooting + // the batch_limit. if (isset($sandbox['batch_limit'])) { - // Max chunk size for limited batches + // Max chunk size for limited batches. $sandbox['max_chunk_size'] = min($sandbox['batch_limit'] - $sandbox['progress'], $sandbox['batch_limit']); - // Update SOLR limit + // Update SOLR limit. $sandbox['queryProcessor']->solrLimit = $sandbox['max_chunk_size']; } - // Execute SOLR query and get result + // Execute SOLR query and get result. $sandbox['queryProcessor']->executeQuery(FALSE, TRUE); $result = $sandbox['queryProcessor']->islandoraSolrResult; $resultCount = (int) $result['response']['numFound']; - // Catch empty result + // Catch empty result. if ($result === NULL || $resultCount === 0) { - // Display error message + // Display error message. $context['message'] = t('No results or SOLR error'); - // Add error to the log + // Add error to the log. $context['results'][] = $context['message']; - // Set finished to 1 so the batch process will not loop endlessly + // Set finished to 1 so the batch process will not loop endlessly. $context['finished'] = 1; - // Return control to batch engine + // Return control to batch engine. return; } - - // Set total based on results (only once every batch) + // Set total based on results (only once every batch). if (!isset($sandbox['total'])) { if (isset($sandbox['batch_limit'])) { // Limited batch total is batch_limit or resultcount if there are less - // results than the batch_limit + // results than the batch_limit. $sandbox['total'] = min($resultCount, $sandbox['batch_limit']); }else{ - // Unlimited batch total is equal to the number of results + // Unlimited batch total is equal to the number of results. $sandbox['total'] = $resultCount; } } - // Process result rows + // Process result rows. module_load_include('inc', 'islandora_xmlsitemap', 'includes/utilities'); - foreach ($result['response']['objects'] as $row) { - // Add or Update link + $resultRows = $result['response']['objects']; + foreach ($resultRows as $row) { + // Add or Update link. islandora_xmlsitemap_add_or_update_link($row['PID'], $sandbox); // Update our progress information. @@ -159,19 +154,19 @@ function islandora_xmlsitemap_batch_operation( // Log result for post-processing in the 'finished' callback. $context['results'][] = "Add/update ".$sandbox['current_node']; } - //Drop results when finished with them + //Drop results when finished with them. $sandbox['queryProcessor']->resetResults(); - // Set message to be returned + // Set message to be returned. $context['message'] = t('Processed @count of @total. Current node: @pid.', array( '@count' => $sandbox['progress'], '@total' => $sandbox['total'], '@pid' => $sandbox['current_node'] )); - // Persist the last processed records last modified date + // Persist the last processed records last modified date. // This allows facilitates continuing from this point onward next time in case - // not all records have been processed due to limits set or a crash + // not all records have been processed due to limits set or a crash. if (isset($sandbox['last_modified'])) { variable_set('islandora_xmlsitemap_last_modified_value', $sandbox['last_modified']); } @@ -182,40 +177,40 @@ function islandora_xmlsitemap_batch_operation( } /** - * Bootstraps a new queryprocessor for Islandora XMLsitemap + * Bootstraps a new queryprocessor for Islandora XMLsitemap. * * @param string $last_modified_field - * The date field to use for sorting and filtering + * The date field to use for sorting and filtering. * * @param string $last_modified - * The last modified date + * The last modified date. * * @param boolean $enforce_namespaces * Enforce Islandora namespaces or not. If enabled allowed namespaces will be - * retrieved using islandora_get_allowed_namespaces(). Defaults to FALSE + * retrieved using islandora_get_allowed_namespaces(). Defaults to FALSE. * - * @return IslandoraSolrQueryProcessor Bootstrapped SOLR query processor + * @return IslandoraSolrQueryProcessor Bootstrapped SOLR query processor. */ function islandora_xmlsitemap_queryProcessor_create($last_modified_field, $last_modified = NULL, $enforce_namespaces = FALSE) { - // Instantiate Islandora Query Processor + // Instantiate Islandora Query Processor. $qp = new IslandoraSolrQueryProcessor(); - // Set sorting on the $last_modified_field field, ascending so we go from older to newer records + // Set sorting on the $last_modified_field field, ascending so we go from older to newer records. $qp->solrParams['sort'] = "$last_modified_field asc"; - // Set empty filter query + // Set empty filter query. $qp->solrParams['fq'] = array(); - // Set or update the latest last_modified value + // Set or update the latest last_modified value. $qp->solrQuery = islandora_xmlsitemap_queryProcessor_last_modified_query( $last_modified_field, $last_modified ); - // Return only the PID field + // Return only the PID field. $qp->solrParams['fl'] = 'PID'; - // Enforce namespace if required by settings + // Enforce namespace if required by settings. if ($enforce_namespaces) { $namespace_map = function ($namespace) { return 'PID:' . Apache_Solr_Service::escape("$namespace:") . '*'; @@ -228,12 +223,12 @@ function islandora_xmlsitemap_queryProcessor_create($last_modified_field, $last_ } /** - * Creates a SOLR query string based on the last modified value + * Creates a SOLR query string based on the last modified value. * - * @param string $last_modified_field Last modified date field - * @param string $last_modified Last modified date + * @param string $last_modified_field Last modified date field. + * @param string $last_modified Last modified date. * - * @return string SOLR query string + * @return string SOLR query string. */ function islandora_xmlsitemap_queryProcessor_last_modified_query($last_modified_field, $last_modified = NULL) { if (isset($last_modified)) { @@ -243,24 +238,24 @@ function islandora_xmlsitemap_queryProcessor_last_modified_query($last_modified_ } /** - * Finished callback + * Finished callback. * - * Displays result or errors when the batch finishes + * Displays result or errors when the batch finishes. */ function islandora_xmlsitemap_finished($success, $results, $operations) { if ($success) { - // Display result + // Display the result. drupal_set_message(t('@count results processed', array('@count' => count($results)))); drupal_set_message(t('The last result was "%final"', array('%final' => end($results)))); } else { - // Display error + // Display an error. drupal_set_message( t( 'An error occurred while processing. The last result was: %final. Batch arguments: @args', array( '%final' => end($results), - '@args' => print_r($error_operation[0], TRUE), + '@args' => print_r($error_operation[0], TRUE) ) ), 'error' diff --git a/islandora_xmlsitemap.drush.inc b/islandora_xmlsitemap.drush.inc index 8925b5d..6448869 100644 --- a/islandora_xmlsitemap.drush.inc +++ b/islandora_xmlsitemap.drush.inc @@ -1,11 +1,14 @@ Date: Thu, 8 Feb 2018 09:21:50 +0100 Subject: [PATCH 04/12] Differentiate between error conditions --- includes/batch.inc | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/includes/batch.inc b/includes/batch.inc index f8acaf4..dcc3d3b 100644 --- a/includes/batch.inc +++ b/includes/batch.inc @@ -113,12 +113,28 @@ function islandora_xmlsitemap_batch_operation($max_chunk_size = 100, $batch_limi $result = $sandbox['queryProcessor']->islandoraSolrResult; $resultCount = (int) $result['response']['numFound']; - // Catch empty result. - if ($result === NULL || $resultCount === 0) { - // Display error message. - $context['message'] = t('No results or SOLR error'); + // Result will be null in case of SOLR error. The queryprocessor will have set + // an error using drupal_set_message. + $hasSolrError = ($result === NULL); + + // Catch a result that does not have any rows + $noResults = ($resultCount === 0); + + // Check if we encountered error conditions + if ($hasSolrError || $noResults) { + // SOLR error + if ($hasSolrError) { + // Set message about error. + $context['message'] = t('SOLR error'); + } - // Add error to the log. + // No results + if ($noResults) { + // Set message about error. + $context['message'] = t('No results'); + } + + // Add error to the result list. $context['results'][] = $context['message']; // Set finished to 1 so the batch process will not loop endlessly. @@ -126,6 +142,13 @@ function islandora_xmlsitemap_batch_operation($max_chunk_size = 100, $batch_limi // Return control to batch engine. return; + } + + ) { + // Display error message. + $context['message'] = t('No results or SOLR error'); + + } // Set total based on results (only once every batch). @@ -245,7 +268,7 @@ function islandora_xmlsitemap_queryProcessor_last_modified_query($last_modified_ function islandora_xmlsitemap_finished($success, $results, $operations) { if ($success) { // Display the result. - drupal_set_message(t('@count results processed', array('@count' => count($results)))); + drupal_set_message(t('@count results', array('@count' => count($results)))); drupal_set_message(t('The last result was "%final"', array('%final' => end($results)))); } else { From 5a3c9eae102ac6a5731af4e16589f2df34fa6c79 Mon Sep 17 00:00:00 2001 From: Jan Jouke Tjalsma Date: Thu, 8 Feb 2018 11:10:22 +0100 Subject: [PATCH 05/12] Improve docblock --- islandora_xmlsitemap.drush.inc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/islandora_xmlsitemap.drush.inc b/islandora_xmlsitemap.drush.inc index 6448869..c671e89 100644 --- a/islandora_xmlsitemap.drush.inc +++ b/islandora_xmlsitemap.drush.inc @@ -5,6 +5,8 @@ */ /** + * Register the islandora_xmlsitemap Drush commands + * * Implements hook_drush_command(). * * @return array From 05ad7373323e0fffed880b0470f26bbb9f1b71f7 Mon Sep 17 00:00:00 2001 From: Jan Jouke Tjalsma Date: Thu, 8 Feb 2018 12:38:05 +0100 Subject: [PATCH 06/12] Move arguments to options and add validation of all options --- README.md | 2 +- islandora_xmlsitemap.drush.inc | 62 +++++++++++++++++++++++++++++----- 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 24ef193..cf497e9 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ Larger sites with greater than 100,000 objects may encounter issues during the s There is a Drush command available for the generation of sitemap links. This command allows you to optionally fetch a limited amount of objects similar to the hook_cron() and lets you define a custom amount to be fetched at once from SOLR. Command: -`drush islandora_xmlsitemap_generate [max_chunk_size] [limit] [--regenerate]` +`drush islandora_xmlsitemap_generate [--max_chunk_size=100] [limit=1000] [--regenerate]` * `Max chunk size` defaults to 100 * If no `limit` is set all objects will be processed diff --git a/islandora_xmlsitemap.drush.inc b/islandora_xmlsitemap.drush.inc index c671e89..24f567e 100644 --- a/islandora_xmlsitemap.drush.inc +++ b/islandora_xmlsitemap.drush.inc @@ -15,21 +15,54 @@ function islandora_xmlsitemap_drush_command() { $items['islandora_xmlsitemap_generate'] = array( 'description' => 'Generate XMLsitemap records.', - 'arguments' => array( - 'max_chunk_size' => 'The number of records to grab each SOLR request', - 'limit' => 'The total number of records to grab before ending the process' - ), 'options' => array( + 'max_chunk_size' => 'The number of records to grab each SOLR request', + 'limit' => 'The total number of records to grab before ending the process (defaults to no limit)', 'regenerate' => 'If set will clear the last_modified date before generating.' - ), + ) ); return $items; } /** - * XMLsitemap Generate command. + * Validate the input for the islandora_xmlsitemap_generate command. + * + * Implements drush_hook_COMMAND_validate(). */ -function drush_islandora_xmlsitemap_generate($max_chunk_size = 100, $limit = null) { +function drush_islandora_xmlsitemap_generate_validate() { + // Validate max_chunk_size option + $max_chunk_size = drush_get_option('max_chunk_size', NULL); + if ($max_chunk_size !== NULL) { + if (!islandora_xmlsitemap_is_integer_positive($max_chunk_size) || $max_chunk_size == 0) { + return drush_set_error('The option "max_chunk_size" must be a positive integer greater than 0.'); + } + } + + // Validate limit option + $limit = drush_get_option('limit', NULL); + if ($limit !== NULL) { + if (!islandora_xmlsitemap_is_integer_positive($limit) || $limit == 0) { + return drush_set_error('The option "limit" must be a positive integer greater than 0.'); + } + } + + // Validate regenerate option + $regenerate = drush_get_option('regenerate', NULL); + if ($regenerate !== NULL) { + if ($regenerate !== TRUE) { + return drush_set_error('The option "regenerate" can only be used without a value (--regenerate)'); + } + } + + return TRUE; +} + +/** + * Execute the islandora_xmlsitemap_generate command. + * + * Implements drush_hook_COMMAND(). + */ +function drush_islandora_xmlsitemap_generate() { // Remove last_modified value if we need to regenerate. $regenerate = drush_get_option('regenerate', NULL); if (isset($regenerate)) { @@ -38,5 +71,18 @@ function drush_islandora_xmlsitemap_generate($max_chunk_size = 100, $limit = nul // Start the process. module_load_include('inc', 'islandora_xmlsitemap', 'includes/batch'); - xmlsitemap_run_unprogressive_batch('islandora_xmlsitemap_get_batch', $max_chunk_size, $limit); + xmlsitemap_run_unprogressive_batch('islandora_xmlsitemap_get_batch', drush_get_option('max_chunk_size', 100), drush_get_option('limit', NULL)); +} + +/** + * Check if a (string) value is both not empty, an integer and positive. + * + * Validation logic taken from: + * https://api.drupal.org/api/drupal/includes%21form.inc/function/element_validate_integer_positive/7.x + * + * @return boolean + * TRUE if $value is a positive integer, FALSE if $value is not a positive integer + */ +function islandora_xmlsitemap_is_integer_positive($value) { + return ($value !== '' && (!is_numeric($value) || intval($value) != $value || $value <= 0)); } From 66d9eae8d7b8757c6d0bf38c31e438ce33e44064 Mon Sep 17 00:00:00 2001 From: Jan Jouke Tjalsma Date: Thu, 8 Feb 2018 12:57:55 +0100 Subject: [PATCH 07/12] Fix incorrect validation --- islandora_xmlsitemap.drush.inc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/islandora_xmlsitemap.drush.inc b/islandora_xmlsitemap.drush.inc index 24f567e..ef4a21d 100644 --- a/islandora_xmlsitemap.drush.inc +++ b/islandora_xmlsitemap.drush.inc @@ -33,7 +33,7 @@ function drush_islandora_xmlsitemap_generate_validate() { // Validate max_chunk_size option $max_chunk_size = drush_get_option('max_chunk_size', NULL); if ($max_chunk_size !== NULL) { - if (!islandora_xmlsitemap_is_integer_positive($max_chunk_size) || $max_chunk_size == 0) { + if (!islandora_xmlsitemap_is_integer_positive($max_chunk_size)) { return drush_set_error('The option "max_chunk_size" must be a positive integer greater than 0.'); } } @@ -41,7 +41,7 @@ function drush_islandora_xmlsitemap_generate_validate() { // Validate limit option $limit = drush_get_option('limit', NULL); if ($limit !== NULL) { - if (!islandora_xmlsitemap_is_integer_positive($limit) || $limit == 0) { + if (!islandora_xmlsitemap_is_integer_positive($limit)) { return drush_set_error('The option "limit" must be a positive integer greater than 0.'); } } @@ -84,5 +84,8 @@ function drush_islandora_xmlsitemap_generate() { * TRUE if $value is a positive integer, FALSE if $value is not a positive integer */ function islandora_xmlsitemap_is_integer_positive($value) { - return ($value !== '' && (!is_numeric($value) || intval($value) != $value || $value <= 0)); + if ($value !== '' && (!is_numeric($value) || intval($value) != $value || $value <= 0)) { + return FALSE; + } + return TRUE; } From e94b9ff4e588640d95064eeed3b0540fb67cbd0e Mon Sep 17 00:00:00 2001 From: Jan Jouke Tjalsma Date: Thu, 8 Feb 2018 13:01:42 +0100 Subject: [PATCH 08/12] Fixed bug introduced while working on error conditions --- includes/batch.inc | 7 ------- 1 file changed, 7 deletions(-) diff --git a/includes/batch.inc b/includes/batch.inc index dcc3d3b..6cb904a 100644 --- a/includes/batch.inc +++ b/includes/batch.inc @@ -142,13 +142,6 @@ function islandora_xmlsitemap_batch_operation($max_chunk_size = 100, $batch_limi // Return control to batch engine. return; - } - - ) { - // Display error message. - $context['message'] = t('No results or SOLR error'); - - } // Set total based on results (only once every batch). From 945897c7545d1d8c691c1922a2f2d9d8c5acda58 Mon Sep 17 00:00:00 2001 From: Jan Jouke Tjalsma Date: Thu, 8 Feb 2018 13:03:01 +0100 Subject: [PATCH 09/12] Improve error text --- includes/batch.inc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/includes/batch.inc b/includes/batch.inc index 6cb904a..53da73a 100644 --- a/includes/batch.inc +++ b/includes/batch.inc @@ -124,13 +124,13 @@ function islandora_xmlsitemap_batch_operation($max_chunk_size = 100, $batch_limi if ($hasSolrError || $noResults) { // SOLR error if ($hasSolrError) { - // Set message about error. + // Set message indicating a SOLR error. $context['message'] = t('SOLR error'); } // No results if ($noResults) { - // Set message about error. + // Set message indicating no results. $context['message'] = t('No results'); } From 235171d9b7d053e7c998bd52af7fcd67b8cbe7ec Mon Sep 17 00:00:00 2001 From: Jan Jouke Tjalsma Date: Thu, 8 Feb 2018 15:21:18 +0100 Subject: [PATCH 10/12] Update README.md --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index cf497e9..bc131ee 100644 --- a/README.md +++ b/README.md @@ -49,14 +49,14 @@ Please also note that objects marked as "inactive", whether manually or by using Larger sites with greater than 100,000 objects may encounter issues during the sitemap building process with the default configuration, such as the process hanging around a specific number indefinitely or exiting the process entirely before completion. These users may want to try unchecking the "Prefetch URL aliases during sitemap generation" option found on the xmlsitemap admin configuration page (/admin/config/search/xmlsitemap/settings) and trying the process again. ## Bulk generation using drush -There is a Drush command available for the generation of sitemap links. This command allows you to optionally fetch a limited amount of objects similar to the hook_cron() and lets you define a custom amount to be fetched at once from SOLR. +There is a Drush command available for the generation of sitemap links. This command allows you to optionally fetch a limited amount of objects (`limit`) similar to the hook_cron() and lets you define a custom amount to be fetched at once (`max_chunk_size`) from SOLR. Command: -`drush islandora_xmlsitemap_generate [--max_chunk_size=100] [limit=1000] [--regenerate]` +`drush islandora_xmlsitemap_generate [--max_chunk_size=100] [--limit=1000] [--regenerate]` -* `Max chunk size` defaults to 100 +* The `max_chunk_size` defaults to 100 * If no `limit` is set all objects will be processed -* The `--regenerate` flag removes the "last_modified" value so will cause processing to start at the beginning +* The `--regenerate` flag removes the "last_modified" value so will cause processing to start at the beginning. *Use with caution if you have lots of objects*. ## Documentation From c0e6dbe2cf029361f0469c125327f16a8cb670ad Mon Sep 17 00:00:00 2001 From: Jan Jouke Tjalsma Date: Thu, 8 Feb 2018 17:11:02 +0100 Subject: [PATCH 11/12] small fix: coding standards --- includes/batch.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/includes/batch.inc b/includes/batch.inc index 53da73a..a92834f 100644 --- a/includes/batch.inc +++ b/includes/batch.inc @@ -168,7 +168,7 @@ function islandora_xmlsitemap_batch_operation($max_chunk_size = 100, $batch_limi $sandbox['current_node'] = $row['PID']; // Log result for post-processing in the 'finished' callback. - $context['results'][] = "Add/update ".$sandbox['current_node']; + $context['results'][] = "Add/update " . $sandbox['current_node']; } //Drop results when finished with them. $sandbox['queryProcessor']->resetResults(); From 087dcf18eba1833f780a4abd9c4d6b13a9509f52 Mon Sep 17 00:00:00 2001 From: Jan Jouke Tjalsma Date: Fri, 9 Feb 2018 15:19:47 +0100 Subject: [PATCH 12/12] Improved Drush integration and fixed bug in chunk_size calculation. This fixes #1 and fixes #2. --- includes/batch.inc | 2 +- islandora_xmlsitemap.drush.inc | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/includes/batch.inc b/includes/batch.inc index a92834f..e7ae112 100644 --- a/includes/batch.inc +++ b/includes/batch.inc @@ -102,7 +102,7 @@ function islandora_xmlsitemap_batch_operation($max_chunk_size = 100, $batch_limi // the batch_limit. if (isset($sandbox['batch_limit'])) { // Max chunk size for limited batches. - $sandbox['max_chunk_size'] = min($sandbox['batch_limit'] - $sandbox['progress'], $sandbox['batch_limit']); + $sandbox['max_chunk_size'] = min($sandbox['batch_limit'] - $sandbox['progress'], $sandbox['max_chunk_size']); // Update SOLR limit. $sandbox['queryProcessor']->solrLimit = $sandbox['max_chunk_size']; diff --git a/islandora_xmlsitemap.drush.inc b/islandora_xmlsitemap.drush.inc index ef4a21d..49e6745 100644 --- a/islandora_xmlsitemap.drush.inc +++ b/islandora_xmlsitemap.drush.inc @@ -69,9 +69,17 @@ function drush_islandora_xmlsitemap_generate() { variable_del('islandora_xmlsitemap_last_modified_value'); } - // Start the process. + // Get the batch module_load_include('inc', 'islandora_xmlsitemap', 'includes/batch'); - xmlsitemap_run_unprogressive_batch('islandora_xmlsitemap_get_batch', drush_get_option('max_chunk_size', 100), drush_get_option('limit', NULL)); + $batch = islandora_xmlsitemap_get_batch(drush_get_option('max_chunk_size', 100), drush_get_option('limit', NULL)); + + // Set and configure the batch. + batch_set($batch); + $batch =& batch_get(); + $batch['progressive'] = FALSE; + + // Start processing + drush_backend_batch_process(); } /**