PHP制作百度词典查词采集器

更新时间:2023-04-06 20:02:47 阅读：评论：0

百度dict 采集样本

本文介绍一个用 PHP 编写的百度词典（dict）采集器，可以采集百度词典查词翻译后返回的全部结果数据。项目中还附带了约 13.5 万条的单词库和一个简单的采集示例。这里把其中的核心类 dict.class.php 放出来，项目地址：http://github.com/widuu/baidu_dict，有需要的朋友直接 fork 即可。这个工具用的人不多，希望对用得上的朋友有所帮助。

<?php
/**
 * dict.class.php - scrapes the Baidu dictionary result page for a word.
 *
 * @copyright  (c) 2014 widuu
 * @license    www.widuu.com
 * @lastmodify 2014-2-15
 */

header("content-type:text/html;charset=utf-8");

/**
 * Downloads the dictionary page for one word and extracts its sections
 * (phonetic symbols, pronunciation URLs, example sentences, brief
 * definitions, synonyms/antonyms and phrases) with regular expressions.
 *
 * Usage:
 *   $dict = new dict();
 *   $data = $dict->content("express");
 *
 * All patterns use the PCRE "U" (ungreedy) modifier so that every ".*"
 * stops at the nearest closing delimiter; with greedy matching several
 * entries on the page collapse into a single match.
 *
 * NOTE(review): the patterns encode the page markup as it was when this
 * class was written - verify them against the live page before relying
 * on the output.
 */
class dict
{
    /** Word currently being looked up. */
    private $word;

    /** Maximum number of phrase entries returned by getphra(). */
    private static $num = 10;

    /**
     * Lookup endpoint; the URL-encoded word is appended to it.
     *
     * NOTE(review): the URL literal was corrupted in the published copy of
     * this file; host and query parameter are reconstructed - confirm them
     * against the upstream project before use.
     */
    private static $baseUrl = "http://dict.baidu.com/s?wd=";

    /** Downloaded pages keyed by word, so one lookup downloads the page once. */
    private $cache = array();

    public function __construct()
    {
    }

    /**
     * Looks up a word and returns every extracted section.
     *
     * @param string $word English word to look up
     * @return array array(
     *     "symbol"  => phonetic symbols,
     *     "pro"     => pronunciation URLs,
     *     "example" => example sentences,
     *     "explain" => brief definitions,
     *     "synonym" => synonyms / antonyms,
     *     "phra"    => phrases
     * )
     */
    public function content($word)
    {
        $this->word = (string) $word;

        return array(
            "symbol"  => $this->pronounced(),
            "pro"     => $this->getsay(),
            "example" => $this->getexample(),
            "explain" => $this->getexplain(),
            "synonym" => $this->getsynonym(),
            "phra"    => $this->getphra(),
        );
    }

    /**
     * Fetches the result page for the current word with cURL.
     *
     * A successful download is cached per word. On a transfer error the
     * cURL error text is echoed (as before) and an empty string is returned
     * so the extractors simply find nothing.
     *
     * @return string page body, or '' on failure
     */
    private function getcontent()
    {
        if (isset($this->cache[$this->word])) {
            return $this->cache[$this->word];
        }

        $useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";
        $url = self::$baseUrl . urlencode($this->word);

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_HTTPGET, 1);
        curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);

        $result = curl_exec($ch);
        if (curl_errno($ch)) {
            echo 'errno' . curl_error($ch);
        }
        curl_close($ch);

        if (!is_string($result)) {
            return '';
        }

        $this->cache[$this->word] = $result;
        return $result;
    }

    /**
     * Runs a one-group pattern over the page and returns its first two
     * captures as the British / American pair.
     *
     * @param string $pattern PCRE pattern with one capturing group
     * @return array array('en' => string|null, 'us' => string|null)
     */
    private function matchpair($pattern)
    {
        preg_match_all($pattern, $this->getcontent(), $found);

        return array(
            'en' => isset($found[1][0]) ? $found[1][0] : null,
            'us' => isset($found[1][1]) ? $found[1][1] : null,
        );
    }

    /**
     * Extracts the phonetic symbols.
     *
     * @return array array('en' => British, 'us' => American); null when absent
     */
    private function pronounced()
    {
        return $this->matchpair('~"en-us">(.*)</b>~Ui');
    }

    /**
     * Extracts the pronunciation URLs (the url="..." attributes).
     *
     * @return array array('en' => British, 'us' => American); null when absent
     */
    private function getsay()
    {
        return $this->matchpair('~url="(.*)"~Ui');
    }

    /**
     * Extracts the example sentences from the page's "var example_data"
     * JavaScript literal.
     *
     * The first quoted string of every innermost array is taken as one
     * token; tokens of one sentence are joined with a space and sentences
     * are returned in consecutive pairs.
     *
     * @return array list of array(sentence, sentence) pairs; empty when the
     *               page has no example data
     */
    private function getexample()
    {
        preg_match_all('~var example_data = (.*)\];~Us', $this->getcontent(), $example);
        if (!isset($example[1][0])) {
            return array();
        }

        // Normalise the leading brackets so every example starts with "[[[".
        $normalized = "[[[" . ltrim($example[1][0], "[");

        $sentences = array();
        foreach (explode("[[[", $normalized) as $block) {
            foreach (explode("[[", "[[" . $block) as $sentence) {
                preg_match_all('~\["(.*)",~Us', "[" . $sentence, $tokens);
                if (!empty($tokens[1])) {
                    $sentences[] = implode(" ", $tokens[1]);
                }
            }
        }

        return array_chunk($sentences, 2);
    }

    /**
     * Extracts the brief definitions.
     *
     * @return array array(
     *     "x" => array(part of speech => definition),
     *     "b" => array(label => related word form)
     * )
     */
    private function getexplain()
    {
        $result = array("x" => array(), "b" => array());

        preg_match_all(
            '~id="en-simple-means">(.*)<div\s+class="source">~Us',
            $this->getcontent(),
            $section
        );
        if (!isset($section[1][0])) {
            return $result;
        }
        $html = $section[1][0];

        preg_match_all(
            '~<p><strong>(?P<adj>.*)</strong><span>(?P<name>.*)</span></p>~Us',
            $html,
            $meanings
        );
        foreach ($meanings["adj"] as $i => $partOfSpeech) {
            $result["x"][$partOfSpeech] = $meanings["name"][$i];
        }

        preg_match_all(
            '~<span>(?P<tag>[^>]+)：<a\s+href="(?:.*)">(?P<word>.*)</a></span>~Us',
            $html,
            $forms
        );
        foreach ($forms["tag"] as $i => $label) {
            $result["b"][$label] = strip_tags($forms["word"][$i]);
        }

        return $result;
    }

    /**
     * Extracts synonyms and antonyms.
     *
     * The section is split on "</dl>"; the index of each piece is the first
     * key of the result (the original documentation describes 0 as synonyms
     * and 1 as antonyms).
     *
     * @return array array(index => array(part of speech => array(title => words)))
     */
    private function getsynonym()
    {
        preg_match_all(
            '~id="en-syn-ant">(.*)<div\s+class="source">~Us',
            $this->getcontent(),
            $section
        );
        if (!isset($section[1][0])) {
            return array();
        }

        $result = array();
        foreach (explode("</dl>", $section[1][0]) as $index => $group) {
            preg_match_all(
                '~<strong>(?P<adj>.*);</strong></div><div\s+class="syn-ant-list"><ul>(?P<content>.*)</ul>~Us',
                $group,
                $lists
            );
            foreach ($lists["content"] as $i => $items) {
                if (empty($items)) {
                    continue;
                }
                preg_match_all(
                    '~<li><p>(?P<title>.*)</p>(?P<value>.*)</li>~Us',
                    $items,
                    $rows
                );
                foreach ($rows["title"] as $j => $title) {
                    // Turn each closing anchor into a space so adjacent words stay separated.
                    $words = strip_tags(str_replace("</a>", " ", $rows["value"][$j]));
                    $result[$index][$lists["adj"][$i]][$title] = $words;
                }
            }
        }

        return $result;
    }

    /**
     * Extracts up to self::$num phrase entries.
     *
     * An entry with only a phrase and a meaning maps phrase => meaning. When
     * an entry carries extra paragraphs they are taken in pairs and nested
     * as array(previous value, pair).
     *
     * @return array array(phrase => meaning | nested array)
     */
    private function getphra()
    {
        preg_match_all(
            '~id="en-phra">(.*)<div class="source">~Us',
            $this->getcontent(),
            $section
        );
        if (!isset($section[1][0])) {
            return array();
        }

        $entries = array_slice(explode("</dd>", $section[1][0]), 0, self::$num);

        $result = array();
        foreach ($entries as $entry) {
            $parts = explode("</p>", $entry);
            $n = count($parts);
            if ($n < 2) {
                // Trailing fragment after the last "</dd>": no phrase/meaning pair.
                continue;
            }

            $phrase = trim(strip_tags($parts[0]));
            $result[$phrase] = strip_tags($parts[1]);
            if ($n <= 3) {
                continue;
            }

            // Everything except the head pair and the trailing fragment.
            $head = array_slice($parts, 0, 2);
            $extra = array_diff(array_slice($parts, 0, $n - 1), $head);
            foreach (array_chunk($extra, 2) as $pair) {
                $pair = array_map('strip_tags', $pair);
                $result[$phrase] = array($result[$phrase], $pair);
            }
        }

        return $result;
    }

    /**
     * Converts an array to its var_export() string, with slashes added.
     *
     * @param array $data       data to convert
     * @param bool  $isformdata when 0, new_stripslashes() is not applied
     *                          first; optional, defaults to 1
     * @return string '' when $data compares equal to '', otherwise the
     *                escaped export
     */
    private function array2string($data, $isformdata = 1)
    {
        if ($data == '') {
            return '';
        }
        if ($isformdata) {
            $data = $this->new_stripslashes($data);
        }
        return addslashes(var_export($data, true));
    }

    /**
     * Applies stripslashes() to a string, or recursively to every element
     * of an array.
     *
     * @param mixed $string string or array to process
     * @return mixed processed value of the same shape
     */
    private function new_stripslashes($string)
    {
        if (!is_array($string)) {
            return stripslashes($string);
        }
        foreach ($string as $key => $val) {
            $string[$key] = $this->new_stripslashes($val);
        }
        return $string;
    }
}

以上就是本文的全部内容了，非常实用的功能，希望小伙伴们能够喜欢。

本文发布于:2023-04-06 20:02:42，感谢您对本站的认可！

本文链接：https://www.wtabcd.cn/fanwen/zuowen/e6b7a3168e14f81fa2687cedd1021983.html

本文word下载地址：PHP制作百度词典查词采集器.doc

本文 PDF 下载地址：PHP制作百度词典查词采集器.pdf

上一篇：安抚奶嘴乳胶好还是硅胶好

下一篇：返回列表