PHPのDOMDocumentが「バイナリセーフでない」よりさらに脆いものだった件
結論から言うと、DOMDocumentは、「改行」「タブ」「キャリッジリターン」以外のコントロールコードが入っていると、それ以降の文字列を無視してしまう。
ちなみに、バイナリセーフでないと叩かれているereg系が文字列を切ってしまうのは、ヌルバイトが入っている場合だけである。
なんという脆さ・・・。
検証スクリプト
<?php
/**
 * Checks which ASCII control codes (0x00-0x1F) can be embedded in an HTML
 * string without losing the text that follows them, for three code paths:
 * DOMDocument::loadHTML()/saveHTML(), str_replace() and ereg_replace().
 *
 * For each path, the name of every control code after which the marker text
 * "at the point of after control code" is still present is printed.
 */

// HTML template. The single %s placeholder receives the control code under test.
$html=<<<EOF
<html>
<head>
<meta http-equiv="Content-Language" content="ja">
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="GENERATOR" content="Microsoft FrontPage 6.0">
<meta name="ProgId" content="FrontPage.Editor.Document">
<title>ASCII Control Code</title>
<meta name="Microsoft Border" content="b, default">
</head>
<body>
<h1>Control Code List</h1>
<div>test document include control code %s at the point of after control code</div>
</body>
</html>
EOF;

// hex2bin() is a built-in function since PHP 5.4; declaring it unconditionally
// is a fatal "Cannot redeclare" error there. Only provide the fallback on older
// versions. A conditionally declared function is not hoisted, so this guard has
// to run before the first checkCtrlCode() call below.
if (!function_exists('hex2bin')) {
    /**
     * Converts a hexadecimal string into the corresponding raw bytes.
     *
     * @param string $hex_str Hexadecimal digits, e.g. "0A".
     * @return string The decoded bytes.
     */
    function hex2bin($hex_str)
    {
        return pack("H*", $hex_str);
    }
}

echo "# DOMDocument\n";
checkCtrlCode('parse_domdocument');

echo "\n# str_replace\n";
checkCtrlCode('string_replace');

echo "\n# ereg_replace\n";
// The ereg extension was removed in PHP 7.0; skip this section instead of
// aborting the whole script with an undefined-function error.
if (function_exists('ereg_replace')) {
    checkCtrlCode('not_binary_safe');
} else {
    echo "(skipped: ereg_replace() is not available in this PHP build)\n";
}

/**
 * Inserts the control code into the template, parses the result with
 * DOMDocument and returns the re-serialized HTML.
 *
 * @param string $html Template containing one %s placeholder.
 * @param string $cc   Control code (raw byte) to insert.
 * @return string Output of saveHTML() followed by a "------" separator line.
 */
function parse_domdocument($html, $cc)
{
    $dom = new DOMDocument();
    $html = sprintf($html, $cc);
    // Parser warnings are suppressed on purpose: only the serialized result matters here.
    @$dom->loadHTML($html);

    return $dom->saveHTML()."\n------\n";
}

/**
 * Inserts the control code into the template and passes the result through
 * str_replace().
 *
 * sprintf() has already filled the placeholder, so str_replace() receives a
 * subject that already contains the control code and finds no '%s' left to
 * replace.
 *
 * @param string $html Template containing one %s placeholder.
 * @param string $cc   Control code (raw byte) to insert.
 * @return string Resulting string followed by a "------" separator line.
 */
function string_replace($html, $cc)
{
    $html = sprintf($html, $cc);
    $html = str_replace('%s', $cc, $html);

    return $html."\n------\n";
}

/**
 * Inserts the control code into the template and passes the result through
 * ereg_replace().
 *
 * As in string_replace(), sprintf() has already filled the placeholder, so
 * ereg_replace() is handed a subject that already contains the control code.
 *
 * @param string $html Template containing one %s placeholder.
 * @param string $cc   Control code (raw byte) to insert.
 * @return string Resulting string followed by a "------" separator line.
 */
function not_binary_safe($html, $cc)
{
    $html = sprintf($html, $cc);
    $html = ereg_replace("%s", $cc, $html);

    return $html."\n------\n";
}

/**
 * Runs $callback once per control code 0x00-0x1F against the global $html
 * template and prints "[name]" for every code after which the marker text is
 * still present in the callback's return value.
 *
 * @param callable $callback Function taking ($html, $controlCode) and returning a string.
 * @return void
 */
function checkCtrlCode($callback)
{
    global $html;

    $ctrl_code = array(
        "00" => 'NULl(ヌル)',
        "01" => 'Start Of Heading(ヘッダ開始)',
        "02" => 'Start of TeXt(テキスト開始)',
        "03" => 'End of TeXt(テキスト終了)',
        "04" => 'End Of Transmission(転送終了)',
        "05" => 'ENQuiry(問合せ)',
        "06" => 'ACKnowledge(肯定応答)',
        "07" => 'BELl(ベル)',
        "08" => 'Back Space(後退)',
        "09" => 'Horizontal Tabulation(水平タブ)',
        "0A" => 'Line Feed(改行)',
        "0B" => 'Vertical Tabulation(垂直タブ)',
        "0C" => 'Form Feed(改ページ)',
        "0D" => 'Carriage Return(復帰)',
        "0E" => 'Shift Out(シフトアウト)',
        "0F" => 'Shift In(シフトイン)',
        "10" => 'Data Link Escape(伝送制御拡張)',
        "11" => 'Device Control 1(装置制御1)',
        "12" => 'Device Control 2(装置制御2)',
        "13" => 'Device Control 3(装置制御3)',
        "14" => 'Device Control 4(装置制御4)',
        "15" => 'Negative AcKnowledge(否定応答)',
        "16" => 'SYNchronous idle(同期信号)',
        // Kept on a single line: a line break inside this label would split the printed name.
        "17" => 'End of Transmission Block(転送ブロック終了)',
        "18" => 'CANcel(取消)',
        "19" => 'End of Medium(媒体終端)',
        "1A" => 'SUBstitute(置換)',
        "1B" => 'ESCape(拡張)',
        "1C" => 'File Separator(ファイル分離)',
        "1D" => 'Group Separator(グループ分離)',
        "1E" => 'Record Separator(レコード分離)',
        "1F" => 'Unit Separator(ユニット分離)',
    );

    foreach ($ctrl_code as $code => $name) {
        $r = $callback($html, hex2bin($code));
        // The marker sits right after the control code in the template, so its
        // presence means the text following the control code was preserved.
        if (strpos($r, 'at the point of after control code') !== false) {
            printf("[%s]\n", $name);
        }
    }
}
実行結果
[tohokuaiki@php5 work]$ php control_code.php
# DOMDocument
[Horizontal Tabulation(水平タブ)]
[Line Feed(改行)]
[Carriage Return(復帰)]
# str_replace
[NULl(ヌル)]
[Start Of Heading(ヘッダ開始)]
[Start of TeXt(テキスト開始)]
[End of TeXt(テキスト終了)]
[End Of Transmission(転送終了)]
[ENQuiry(問合せ)]
[ACKnowledge(肯定応答)]
[BELl(ベル)]
[Back Space(後退)]
[Horizontal Tabulation(水平タブ)]
[Line Feed(改行)]
[Vertical Tabulation(垂直タブ)]
[Form Feed(改ページ)]
[Carriage Return(復帰)]
[Shift Out(シフトアウト)]
[Shift In(シフトイン)]
[Data Link Escape(伝送制御拡張)]
[Device Control 1(装置制御1)]
[Device Control 2(装置制御2)]
[Device Control 3(装置制御3)]
[Device Control 4(装置制御4)]
[Negative AcKnowledge(否定応答)]
[SYNchronous idle(同期信号)]
[End of Transmission Block(転送ブロック終了)]
[CANcel(取消)]
[End of Medium(媒体終端)]
[SUBstitute(置換)]
[ESCape(拡張)]
[File Separator(ファイル分離)]
[Group Separator(グループ分離)]
[Record Separator(レコード分離)]
[Unit Separator(ユニット分離)]
# ereg_replace
[Start Of Heading(ヘッダ開始)]
[Start of TeXt(テキスト開始)]
[End of TeXt(テキスト終了)]
[End Of Transmission(転送終了)]
[ENQuiry(問合せ)]
[ACKnowledge(肯定応答)]
[BELl(ベル)]
[Back Space(後退)]
[Horizontal Tabulation(水平タブ)]
[Line Feed(改行)]
[Vertical Tabulation(垂直タブ)]
[Form Feed(改ページ)]
[Carriage Return(復帰)]
[Shift Out(シフトアウト)]
[Shift In(シフトイン)]
[Data Link Escape(伝送制御拡張)]
[Device Control 1(装置制御1)]
[Device Control 2(装置制御2)]
[Device Control 3(装置制御3)]
[Device Control 4(装置制御4)]
[Negative AcKnowledge(否定応答)]
[SYNchronous idle(同期信号)]
[End of Transmission Block(転送ブロック終了)]
[CANcel(取消)]
[End of Medium(媒体終端)]
[SUBstitute(置換)]
[ESCape(拡張)]
[File Separator(ファイル分離)]
[Group Separator(グループ分離)]
[Record Separator(レコード分離)]
[Unit Separator(ユニット分離)]