=== modified file 'sites/all/modules/import_html/import_html.module'
--- sites/all/modules/import_html/import_html.module 2010-02-10 00:09:24 +0000
+++ sites/all/modules/import_html/import_html.module 2010-02-11 04:51:36 +0000
@@ -383,6 +383,7 @@
'recursion_behaviour' => IMPORT_HTML_GLOB_BEFORE,
'debug_level' => 0,
'keep_temp_files' => FALSE,
+ 'pretidy_cmd' => '',
);
}
=== modified file 'sites/all/modules/import_html/import_html_process.inc'
--- sites/all/modules/import_html/import_html_process.inc 2010-02-09 23:13:51 +0000
+++ sites/all/modules/import_html/import_html_process.inc 2010-02-11 03:55:25 +0000
@@ -523,6 +523,65 @@
return TRUE;
}
+/**
+ * 'Pre-tidy' a file: run an external parser on the HTML file, _before_ running HTMLtidy on it.
+ *
+ * A file that is not local (per is_local()) is copied to a temp file first. Largely copied from xml_tidy_file().
+ *
+ * @param $filepath
+ *   Path (or remote location) of the HTML file to process.
+ * @param $pretidy_cmd
+ *   Command to run, used as given; the escaped file name is appended as its last argument.
+ * @return
+ *   The command's output as one string, or FALSE if the file cannot be read/found or the command printed nothing.
+ */
+function import_html_pretidy_file($filepath, $pretidy_cmd) {
+  import_html_debug("Pre-tidying file '$filepath' ");
+
+  if (! is_local($filepath)) {
+    // The command line needs a file on disk, so copy the remote content to a temp file.
+    $source = file_get_contents($filepath);
+    if (! $source) {
+      trigger_error("No content from '$filepath'", E_USER_WARNING );
+      return FALSE;
+    }
+    import_html_debug("Retrieved remote file:$filepath is ". strlen($source) ." big");
+    $target_path = tempnam($_ENV['TEMP'], "htm");
+    file_put_contents( foreslash($target_path) , $source );
+  }
+  else {
+    $target_path = $filepath;
+  }
+
+  if (! file_exists($target_path) ) {
+    import_html_debug("Attempted to pre-tidy a file that doesn't exist. Looking for $target_path failed!", array(), WATCHDOG_ERROR);
+    return FALSE;
+  }
+
+  // Escape the file name instead of hand-quoting it, so that quotes or shell
+  // metacharacters in it cannot alter the command. $pretidy_cmd is run as given.
+  $command = $pretidy_cmd . ' ' . escapeshellarg(foreslash($target_path));
+  import_html_debug("Running \n$command");
+  exec($command, $response, $return_code);
+  $out = join("\n", $response);
+
+  if (!$out) {
+    // Run it again with STDERR included; a fresh array is used because exec() appends.
+    exec($command .' 2>&1', $errors, $return_code);
+    trigger_error(
+ "The pre-tidy command failed to parse the input!
+ I ran \n$command\n
+ and got: $return_code
+". htmlspecialchars(join("\n", $errors)) ."\n" , E_USER_WARNING);
+    $out = FALSE;
+  }
+  import_html_debug_code("After command-line tidy", $out);
+
+  if ($target_path != $filepath) {
+    // Remove the temp copy made for a non-local file.
+    unlink($target_path);
+  }
+  return $out;
+}
 /** * Analyse a source page and create a node definition from it. @@ -572,6 +631,28 @@ trigger_error("Path '$path' was not found. This should have been a local copy of the file being imported, but the paths may be wrong somehow. Abject failure processing $rel_path"); } + if ($profile['pretidy_cmd']) { + $data = import_html_pretidy_file($path, $profile['pretidy_cmd']); + if ($data) { + // Right; now we have 'pre-tidied' output in a string. + // However I don't feel safe calling parse_in_xml_string() rather than parse_in_xml_file() + // because that has a different 'handling'. (Is sometimes called twice.) + // In order not to touch existing code, we'll write to a file. Can always change that later. + $temp_path = foreslash( tempnam($_ENV['TEMP'], "htm") ); + file_put_contents($temp_path, $data); + if (! file_exists($temp_path)) { + import_html_debug( + "Failed to create/write temp file '%path' with pre-tidied output. Will continue parsing 'un-pre-tidied' file.", + array('path' => $temp_path) + ); + $temp_path = ''; + } + } + } + if (!$temp_path) { + $temp_path = $path; + } + /* * Trying to parse pure XML first is causing problems * Either I want everything to be html, (always tidy) @@ -581,19 +662,24 @@ */ // temporarily ignore parser errors (catch?) set_error_handler('stfu'); - $xmldoc = parse_in_xml_file($path, $profile['force_tidy']); + $xmldoc = parse_in_xml_file($temp_path, $profile['force_tidy']); restore_error_handler(); if (! 
$xmldoc && $profile['force_tidy'] ) { import_html_debug( "%path was not tidy enough - running tidy over it now so I can parse it.", - array('%path' => $path, '%rel_path' => $rel_path) + array('%path' => $temp_path, '%rel_path' => $rel_path) ); // If a raw XML parse failed, // tell parse_in_xml_file() to use htmlTidy before it begins // TODO - add a flag to skip this double-processing, (parsing twice) it may be a bit slow if it's not often used - $xmldoc = parse_in_xml_file($path, TRUE); + $xmldoc = parse_in_xml_file($temp_path, TRUE); + } + if ($temp_path != $path) { + // remove temp file we just made up. + unlink($temp_path); } + #import_html_debug_code("Finished reading from file:", xml_tostring($xmldoc)); $source_node = new stdClass(); } === modified file 'sites/all/modules/import_html/import_html_ui.inc' --- sites/all/modules/import_html/import_html_ui.inc 2010-02-18 22:10:33 +0000 +++ sites/all/modules/import_html/import_html_ui.inc 2010-02-18 22:18:17 +0000 @@ -589,7 +589,17 @@ files/import directory. "), ); - + $form['advanced']['pretidy_cmd'] = array( + '#type' => 'textfield', + '#title' => t("Pre-tidy command"), + '#default_value' => $profile['pretidy_cmd'], + '#description' => t(" + A command to run on each HTML file, before any processing and before HTML Tidy is run. + This is only necessary if your files contain such faulty HTML that it even confuses HTML Tidy + (like old MS FrontPage files containing insanely placed start/end tags). + The command you specify will be run with the full filename appended (acting as a commandline argument). It must output the 'pre-tidied' HTML on STDOUT. + "), + ); $form['advanced']['import_html_other_logic'] = array( '#value' => t("