先贴个代码留个坑.
爬虫类-爬取网站的图片
/**
 * Minimal image crawler.
 *
 * Fetches one HTML page with cURL, extracts the `src` attribute of its
 * <img> tags, downloads every image that can be addressed with an absolute,
 * protocol-relative or root-relative URL, and stores each one under a
 * generated file name in the save directory.
 */
class Image_Spider
{
    /** @var resource|object|null cURL handle configured for the page to crawl. */
    protected $curl_handle;

    /** @var string Host name of the crawled page ('' when the URL had none). */
    protected $host;

    /** @var int|null Explicit port of the crawled page, or null when the URL had none. */
    protected $port;

    /** @var string URL scheme of the crawled page (falls back to 'http'). */
    protected $scheme;

    /** @var string Directory the downloaded images are written to. */
    protected $save_path;

    /**
     * Known image file suffixes. The class itself detects the type from the
     * file signature (see check_image_type()) and does not read this
     * constant; it is kept so existing references keep working.
     */
    const IMAGE_SUFFIX = ['.gif', '.jpeg', '.jpg', '.png', '.bmp'];

    /**
     * @param string $url  Address of the page to crawl.
     * @param string $path Directory for the downloaded images (absolute or relative).
     */
    public function __construct($url, $path = '/home/yy/my_image')
    {
        $this->save_path = $path;

        $this->curl_handle = curl_init();
        curl_setopt_array($this->curl_handle, array(
            CURLOPT_URL            => $url,
            CURLOPT_RETURNTRANSFER => TRUE,
            CURLOPT_HEADER         => FALSE,
            CURLOPT_USERAGENT      => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36',
            CURLOPT_REFERER        => 'http://www.baidu.com',
        ));

        // parse_url() returns FALSE for seriously malformed URLs and omits the
        // keys it cannot find, so every component is read defensively.
        $parts = parse_url($url);
        if (!is_array($parts)) {
            $parts = array();
        }
        $this->scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
        $this->host   = isset($parts['host']) ? $parts['host'] : '';
        $this->port   = isset($parts['port']) ? (int) $parts['port'] : null;
    }

    /**
     * Releases the cURL handle, if one was created.
     */
    public function __destruct()
    {
        if (is_resource($this->curl_handle) || is_object($this->curl_handle)) {
            curl_close($this->curl_handle);
        }
        $this->curl_handle = null;
    }

    /**
     * Detects the image type from the leading bytes of the binary data.
     *
     * @param string $image Raw file contents.
     * @return string|false 'jpg', 'gif', 'png' or 'bmp'; FALSE when the data
     *                      is not a string or matches none of the signatures.
     */
    public function check_image_type($image)
    {
        if (!is_string($image)) {
            return FALSE;
        }

        // JPEG data is always reported as 'jpg'.
        $signatures = array(
            'jpg' => "\xFF\xD8\xFF",
            'gif' => "GIF",
            'png' => "\x89\x50\x4e\x47\x0d\x0a\x1a\x0a",
            'bmp' => 'BM',
        );
        foreach ($signatures as $type => $signature) {
            if (strncmp($image, $signature, strlen($signature)) === 0) {
                return $type;
            }
        }

        return FALSE;
    }

    /**
     * Downloads the page and returns the raw `src` values of its <img> tags.
     *
     * @return string[] Extracted values in document order; an empty array
     *                  when the request fails or nothing matches.
     */
    public function get_images_url()
    {
        $content = curl_exec($this->curl_handle);
        if (!is_string($content) || $content === '') {
            return array();
        }

        if (!preg_match_all('/<img\s*.*\s*src=[\'"](.*)[\'"]\s*.*\s*>/Ui', $content, $match)) {
            return array();
        }

        return $match[1];
    }

    /**
     * Downloads one image and writes it into the save directory.
     *
     * The file is named `<ymd>_<uniqid>.<type>`, where the type comes from the
     * file signature rather than from the URL.
     *
     * @param string $image_url Location of the image to fetch.
     * @return bool TRUE when the image was written; FALSE when the download
     *              failed, the data is not a recognised image, or the write failed.
     */
    public function save_image($image_url)
    {
        $image = file_get_contents($image_url);
        if ($image === FALSE) {
            return FALSE;
        }

        $type = $this->check_image_type($image);
        if ($type === FALSE) {
            return FALSE;
        }

        // Also done here so that a direct call (without save_action()) still
        // writes inside the directory instead of next to it.
        $this->normalize_save_path();
        $filename = date('ymd') . '_' . uniqid() . '.' . $type;

        return file_put_contents($this->save_path . $filename, $image) !== FALSE;
    }

    /**
     * Crawls the page and saves every image that can be resolved to a URL.
     *
     * Creates the save directory (including missing parents) when needed.
     *
     * @return int Number of images written to disk.
     */
    public function save_action()
    {
        $directory = (string) $this->save_path;
        if ($directory !== '' && !is_dir($directory)) {
            // The second is_dir() covers a directory created concurrently.
            if (!mkdir($directory, 0777, true) && !is_dir($directory)) {
                return 0;
            }
        }
        $this->normalize_save_path();

        $saved = 0;
        foreach ($this->get_images_url() as $image_url) {
            $resolved = $this->resolve_image_url($image_url);
            if ($resolved !== FALSE && $this->save_image($resolved)) {
                $saved++;
            }
        }

        return $saved;
    }

    /**
     * Turns a raw `src` value into a fetchable http(s) URL.
     *
     * @param string $image_url Value taken from an <img> tag.
     * @return string|false The URL to download, or FALSE when the value is
     *                      neither absolute, protocol-relative nor
     *                      root-relative (document-relative paths are skipped
     *                      because the page path is not stored).
     */
    protected function resolve_image_url($image_url)
    {
        $image_url = trim((string) $image_url);

        // Protocol-relative: reuse the scheme of the crawled page.
        if (strncmp($image_url, '//', 2) === 0) {
            return $this->scheme . ':' . $image_url;
        }

        // Root-relative: keep the leading slash between host and path.
        if (strncmp($image_url, '/', 1) === 0) {
            if ((string) $this->host === '') {
                return FALSE;
            }
            $authority = $this->host;
            if ($this->port !== null) {
                $authority .= ':' . $this->port;
            }
            return $this->scheme . '://' . $authority . $image_url;
        }

        // Only values that actually start with http:// or https:// pass; a
        // mere occurrence of "http" somewhere in a path is not enough.
        if (preg_match('#^https?://#i', $image_url) === 1) {
            return $image_url;
        }

        return FALSE;
    }

    /**
     * Appends a directory separator to the save path unless it is empty or
     * already ends with one. Safe to call repeatedly.
     */
    protected function normalize_save_path()
    {
        $path = (string) $this->save_path;
        if ($path === '') {
            return;
        }

        // Byte-wise check: separators are ASCII and never form the tail of a
        // multibyte UTF-8 character.
        $last = substr($path, -1);
        if ($last !== '/' && $last !== DIRECTORY_SEPARATOR) {
            $this->save_path = $path . DIRECTORY_SEPARATOR;
        }
    }
}
// Example run: crawl the page below and store the images it references in
// the spider's default save directory (the second constructor argument is
// omitted, so '/home/yy/my_image' is used).
$url = 'http://www.nipic.com/index.html';
// Constructing the spider only prepares the cURL handle; no request is sent yet.
$obj = new Image_Spider($url);
// Fetches the page, then downloads and saves the images.
$obj->save_action();