闲来无事,做了个图片抓取的小工具。
把某个网站页面上的图片抓取下来,保存到本地指定目录,同时生成用于写入数据库的 SQL 语句。
思路如下:
- 获取页面中所有图片的 URL
- 根据图片 URL 把图片下载到本地
- 把下载好的图片存储到指定目录,同时生成写入数据库的 SQL 语句
完整代码如下:
<?php
/**
 * Scrapes the <img> tags of a single web page, saves the referenced images
 * into a dated directory and writes a multi-row INSERT statement describing
 * the saved files to "sql.txt".
 */
class RetileImg
{
    /** @var string Address of the page to fetch. */
    protected $url;
    /** @var bool false for an http request, true for an https request (used as CURLOPT_SSL_VERIFYPEER). */
    protected $url_status = false;
    /** @var string Directory the images are stored in (always ends with "/"). */
    protected $img_path;
    /** @var string Directory the SQL file is stored in (always ends with "/"). */
    protected $sql_path;
    /** @var string Domain prepended to relative image paths; leave empty to keep them untouched. */
    protected $domain_name;

    /**
     * @param string $url         page to fetch
     * @param bool   $url_status  false for http, true for https
     * @param string $img_path    base image directory; a "Ymd" sub-directory is appended
     * @param string $sql_path    directory that receives sql.txt
     * @param string $domain_name domain used to complete relative image paths
     */
    public function __construct($url, $url_status, $img_path, $sql_path, $domain_name)
    {
        $this->url = $url;
        $this->url_status = $url_status;
        // Trim a trailing slash first so "./img/" and "./img" produce the same path.
        $this->img_path = rtrim($img_path, '/') . "/" . date("Ymd") . "/";
        $this->sql_path = rtrim($sql_path, '/') . "/";
        $this->domain_name = $domain_name;
    }

    /**
     * Fetch the page.
     *
     * @return bool|string the response body, or false when the request fails
     */
    public function curl_web()
    {
        $ch = curl_init($this->url);
        if ($ch === false) {
            return false;
        }
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 100);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 100);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (bool) $this->url_status);

        return curl_exec($ch);
    }

    /**
     * Extract the src attribute of every <img> tag in the given markup.
     *
     * Matching is case-insensitive, tolerates tags that span several lines,
     * requires the closing quote to be the same character as the opening one,
     * and only accepts the attribute named exactly "src" (not "data-src").
     * HTML entities are decoded in each captured value rather than in the
     * whole page, so entity-escaped markup (e.g. code samples) is not
     * mistaken for real tags.
     *
     * @param mixed $content page markup; anything that is not a non-empty string yields null
     * @return array|null list of src values in document order, or null when none is found
     */
    public function get_img_from_html($content)
    {
        if (!is_string($content) || $content === '') {
            return null;
        }

        $pattern = '/<img\s+(?:[^>]*?\s)?src\s*=\s*(["\'])([^>]*?)\1/i';
        if (!preg_match_all($pattern, $content, $match)) {
            return null;
        }

        return array_map(function ($src) {
            return htmlspecialchars_decode($src, ENT_QUOTES);
        }, $match[2]);
    }

    /**
     * Collect the image addresses of the page.
     *
     * Empty src values are skipped. Addresses starting with http:// or
     * https:// are kept as they are, protocol-relative ones ("//host/x.png")
     * receive the scheme of the page address, and every other path is joined
     * to the configured domain with exactly one "/" (left untouched when no
     * domain is configured).
     *
     * @return array list of image addresses; empty when the page has no images
     */
    public function get_img_urls()
    {
        $img_urls = $this->get_img_from_html($this->curl_web());
        if (empty($img_urls)) {
            return [];
        }

        $domain_name = (string) $this->domain_name;
        $scheme = parse_url((string) $this->url, PHP_URL_SCHEME);
        if (empty($scheme)) {
            $scheme = 'http';
        }

        $urls = [];
        foreach ($img_urls as $v) {
            $v = trim($v);
            if ($v === '') {
                continue;
            }
            if (preg_match('#^https?://#i', $v)) {
                // Already absolute: nothing to do.
            } elseif (strpos($v, '//') === 0) {
                $v = $scheme . ':' . $v;
            } elseif ($domain_name !== '') {
                $v = rtrim($domain_name, '/') . '/' . ltrim($v, '/');
            }
            $urls[] = $v;
        }

        return $urls;
    }

    /**
     * Download the page's images and write the matching INSERT statement.
     *
     * Only bmp/gif/jpg/jpeg/png images are handled; the extension is read
     * from the path component of the address, case-insensitively. An image
     * that cannot be read or written is skipped and gets no SQL row.
     * sql.txt is always written; it is empty when no image was saved.
     *
     * @throws Exception when random_int() finds no entropy source or a target directory cannot be created
     */
    public function download()
    {
        $img_urls = $this->get_img_urls();
        $img_path = $this->img_path;
        $sql_path = $this->sql_path;
        $mimes = ['bmp', 'gif', 'jpg', 'jpeg', 'png'];

        $rows = [];
        foreach ($img_urls as $v) {
            $ext = $this->get_extension($v);
            // Only keep the image types we want.
            if (!in_array($ext, $mimes, true)) {
                continue;
            }

            $content = file_get_contents($v);
            if ($content === false) {
                continue;
            }

            $this->ensure_dir($img_path);
            // Regenerate the name in the unlikely case it collides with an existing file.
            do {
                $file_name = md5(date("YmdHis") . random_int(10, 99999));
                $file_path = $img_path . $file_name . "." . $ext;
            } while (file_exists($file_path));

            if (file_put_contents($file_path, $content) === false) {
                continue;
            }

            $rows[] = "(" . implode(",", [
                $this->quote_sql($file_name),
                $this->quote_sql($file_path),
                $this->quote_sql($file_name),
                $this->quote_sql(date("Y-m-d H:i:s")),
            ]) . ")";
        }

        $sql = '';
        if (!empty($rows)) {
            $sql = "insert into img_data (img_name,img_url,img_description,add_time) values "
                . implode(",", $rows) . ";";
        }

        $this->ensure_dir($sql_path);
        file_put_contents($sql_path . "sql.txt", $sql);
    }

    /**
     * Lower-cased file extension of the path component of an address ("" when there is none).
     */
    protected function get_extension($url)
    {
        $path = parse_url($url, PHP_URL_PATH);
        if (!is_string($path)) {
            return '';
        }

        return strtolower(pathinfo($path, PATHINFO_EXTENSION));
    }

    /**
     * Create a directory (recursively) unless it already exists.
     *
     * @throws RuntimeException when the directory cannot be created
     */
    protected function ensure_dir($dir)
    {
        if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir)) {
            throw new RuntimeException("Unable to create directory: " . $dir);
        }
    }

    /**
     * Wrap a value in single quotes for the generated SQL text.
     *
     * NOTE(review): addslashes() is adequate for a generated file under
     * MySQL's default escaping rules; use prepared statements if these
     * values are ever sent to a database directly.
     */
    protected function quote_sql($value)
    {
        return "'" . addslashes((string) $value) . "'";
    }
}
// Run the image scraper against one page.
header("Content-type: text/html; charset=utf-8");
// Constructor arguments, in order: page address, https flag, image directory,
// SQL directory, domain used to complete relative image paths.
$scraper_args = [
    "https://erik.xyz/2014/10/17/zhe-shi-yi-ge-kai-shi-de-jie-shu/",
    true,
    "./img",
    "./sql",
    "https://erik.xyz",
];
$img_data = new RetileImg(...$scraper_args);
$img_data->download();
本文作者:
艾瑞可erik
本文链接: https://erik.xyz/2019/09/03/zhua-qu-tu-pian-dao-ben-di-bing-sheng-cheng-sql/
版权声明: 本作品采用 知识共享署名-非商业性使用-相同方式共享 4.0 国际许可协议 进行许可。转载请注明出处!
本文链接: https://erik.xyz/2019/09/03/zhua-qu-tu-pian-dao-ben-di-bing-sheng-cheng-sql/
版权声明: 本作品采用 知识共享署名-非商业性使用-相同方式共享 4.0 国际许可协议 进行许可。转载请注明出处!