=== modified file 'sites/all/modules/import_html/import_html.module'
--- sites/all/modules/import_html/import_html.module 2010-02-10 00:09:24 +0000
+++ sites/all/modules/import_html/import_html.module 2010-02-11 04:51:36 +0000
@@ -383,6 +383,7 @@
     'recursion_behaviour' => IMPORT_HTML_GLOB_BEFORE,
     'debug_level' => 0,
     'keep_temp_files' => FALSE,
+    'pretidy_cmd' => '',
   );
 }
 
=== modified file 'sites/all/modules/import_html/import_html_process.inc'
--- sites/all/modules/import_html/import_html_process.inc 2010-02-09 23:13:51 +0000
+++ sites/all/modules/import_html/import_html_process.inc 2010-02-11 03:55:25 +0000
@@ -523,6 +523,87 @@
   return TRUE;
 }
 
+/**
+ * 'Pre-tidy' a file: run an external command over the HTML file _before_
+ * HTML Tidy gets to see it.
+ *
+ * If $filepath is not local (as reported by is_local()), its contents are
+ * first fetched into a temporary file, which is removed again before
+ * returning.
+ *
+ * Code largely copied from xml_tidy_file()
+ *
+ * @param $filepath
+ *   Path or URL of the HTML file to process.
+ * @param $pretidy_cmd
+ *   Command line to run. The quoted file name is appended as its last
+ *   argument, and the command must print the pre-tidied HTML on STDOUT.
+ *
+ * @return
+ *   The command's output as a string, or FALSE on any failure.
+ */
+function import_html_pretidy_file($filepath, $pretidy_cmd) {
+  import_html_debug("Pre-tidying file '$filepath' ");
+
+  if (! is_local($filepath)) {
+    // OK, so it's a remote file and I have to process it on the command-line ...
+    // Copy it down
+    $source = file_get_contents($filepath);
+    if (! $source) {
+      trigger_error("No content from '$filepath'", E_USER_WARNING );
+      return FALSE;
+    }
+    import_html_debug("Retrieved remote file:$filepath is ". strlen($source) ." big");
+    // put it in a temp place
+    $target_path = tempnam($_ENV['TEMP'], "htm");
+    file_put_contents( foreslash($target_path) , $source );
+  }
+  else {
+    $target_path = $filepath;
+  }
+
+  if (! file_exists($target_path) ) {
+    import_html_debug("Attempted to pre-tidy a file that doesn't exist. Looking for $target_path failed!", array(), WATCHDOG_ERROR);
+    // Explicit FALSE, matching every other failure path of this function.
+    return FALSE;
+  }
+
+  // $pretidy_cmd is the configured command line and is run as-is.
+  // The file name may contain shell metacharacters, so it is shell-escaped
+  // rather than merely wrapped in double quotes.
+  $command = $pretidy_cmd . ' ' . escapeshellarg(foreslash($target_path));
+
+  import_html_debug("Running \n$command");
+  $response = array();
+  exec($command, $response, $return_code);
+  $out = join("\n", $response);
+
+  // Compare against '' so that a legitimate output of "0" is not mistaken
+  // for a failure.
+  if ($out === '') {
+    // run the exact same command again, but collect the errors this time
+    $response = array();
+    exec($command .' 2>&1', $response, $return_code);
+    $out = join("\n", $response);
+
+    trigger_error(
+      "The pre-tidy command failed to parse the input!
+      I ran \n$command\n
+      and got: $return_code
+". htmlspecialchars($out) ."
+\n" , E_USER_WARNING);
+    $out = FALSE;
+  }
+
+  import_html_debug_code("After command-line tidy", $out);
+
+  if ($target_path != $filepath) {
+    // remove temp file we just made up.
+    unlink($target_path);
+  }
+
+  return $out;
+}
 
 /**
  * Analyse a source page and create a node definition from it.
@@ -572,6 +653,34 @@
     trigger_error("Path '$path' was not found. This should have been a local copy of the file being imported, but the paths may be wrong somehow. Abject failure processing $rel_path");
   }
 
+  // Parse the original file unless a pre-tidied copy is successfully produced below.
+  $temp_path = $path;
+  if (!empty($profile['pretidy_cmd'])) {
+    $data = import_html_pretidy_file($path, $profile['pretidy_cmd']);
+    if ($data) {
+      // Right; now we have 'pre-tidied' output in a string.
+      // However I don't feel safe calling parse_in_xml_string() rather than parse_in_xml_file()
+      // because that has a different 'handling'. (Is sometimes called twice.)
+      // In order not to touch existing code, we'll write to a file. Can always change that later.
+      $pretidied_path = tempnam($_ENV['TEMP'], "htm");
+      // tempnam() creates the file itself, so file_exists() cannot tell whether
+      // the write worked; check the result of file_put_contents() instead.
+      if ($pretidied_path && file_put_contents(foreslash($pretidied_path), $data) !== FALSE) {
+        $temp_path = foreslash($pretidied_path);
+      }
+      else {
+        import_html_debug(
+          "Failed to create/write temp file '%path' with pre-tidied output. Will continue parsing 'un-pre-tidied' file.",
+          array('%path' => $pretidied_path)
+        );
+        if ($pretidied_path) {
+          // Don't leave the empty placeholder file behind.
+          unlink($pretidied_path);
+        }
+      }
+    }
+  }
+
   /*
    * Trying to parse pure XML first is causing problems
    * Either I want everything to be html, (always tidy)
@@ -581,19 +690,24 @@
    */
   // temporarily ignore parser errors (catch?)
   set_error_handler('stfu');
-  $xmldoc = parse_in_xml_file($path, $profile['force_tidy']);
+  $xmldoc = parse_in_xml_file($temp_path, $profile['force_tidy']);
   restore_error_handler();
 
   if (! $xmldoc && $profile['force_tidy'] ) {
     import_html_debug(
       "%path was not tidy enough - running tidy over it now so I can parse it.",
-      array('%path' => $path, '%rel_path' => $rel_path)
+      array('%path' => $temp_path, '%rel_path' => $rel_path)
     );
     // If a raw XML parse failed,
     // tell parse_in_xml_file() to use htmlTidy before it begins
     // TODO - add a flag to skip this double-processing, (parsing twice) it may be a bit slow if it's not often used
-    $xmldoc = parse_in_xml_file($path, TRUE);
+    $xmldoc = parse_in_xml_file($temp_path, TRUE);
+  }
+  if ($temp_path != $path) {
+    // remove temp file we just made up.
+    unlink($temp_path);
   }
+
   #import_html_debug_code("Finished reading from file:", xml_tostring($xmldoc));
   $source_node = new stdClass();
 }
=== modified file 'sites/all/modules/import_html/import_html_ui.inc'
--- sites/all/modules/import_html/import_html_ui.inc 2010-02-18 22:10:33 +0000
+++ sites/all/modules/import_html/import_html_ui.inc 2010-02-18 22:18:17 +0000
@@ -589,7 +589,17 @@
       files/import directory.
     "),
   );
-  
+  $form['advanced']['pretidy_cmd'] = array(
+    '#type' => 'textfield',
+    '#title' => t("Pre-tidy command"),
+    '#default_value' => $profile['pretidy_cmd'],
+    '#description' => t("
+      A command to run on each HTML file, before any processing and before HTML Tidy is run.
+      This is only necessary if your files contain such faulty HTML that it even confuses HTML Tidy
+      (like old MS FrontPage files containing insanely placed start/end tags).
+      The command you specify will be run with the full filename appended (acting as a commandline argument). It must output the 'pre-tidied' HTML on STDOUT.
+    "),
+  );
   $form['advanced']['import_html_other_logic'] = array(
     '#value' => t("