关于php:php-tp5-爬取淘宝天猫店铺的信息

2次阅读

共计 3241 个字符,预计需要花费 9 分钟才能阅读完成。

淘宝店铺的信息的爬取

直接上代码

    // Read the shop URL passed in with the request.
    // NOTE(review): $link is request input and is fetched server-side as-is; consider
    // restricting it to the expected Taobao/Tmall hosts before fetching.
    $link = input('link');
    $content = $this->getRquest($link);
    // Transcode the raw page to UTF-8 so the UTF-8 literals in the regexes below can match.
    // GBK is a superset of GB2312, so characters outside GB2312 are no longer lost; it also
    // matches the GBK decoding used for the shop name further down.
    // The cast turns a failed fetch (false) into an empty string.
    $content = mb_convert_encoding((string) $content, 'utf-8', 'GBK');
    // Load the phpQuery library -- it is stored under the /extend directory.
    \think\Loader::import('phpQuery.phpQuery', EXTEND_PATH);
    // phpQuery's file loader is newDocumentFile() (singular). It fetches $link itself, so the
    // pq() selectors below run against that document, not against $content.
    $html  = \phpQuery::newDocumentFile($link);
    // phpQuery download: https://code.google.com/archive/p/phpquery/downloads
    // If the download is unavailable, search for a mirror (the original author also offers
    // to send a copy by e-mail on request).

    // Shop name.
    $shop_name= pq(".shop-name>a")->text();
    // Round-trip the text through ISO-8859-1 to get single bytes back, then decode those bytes
    // as GBK. NOTE(review): presumably this undoes phpQuery reading the GBK page as Latin-1 — confirm.
    $shop_name = mb_convert_encoding($shop_name,'ISO-8859-1','utf-8');
    $shop_name = mb_convert_encoding($shop_name,'utf-8','GBK');
    // Drop the "enter shop" link caption that is part of the same anchor text.
    $shop_name = trim(str_replace("进入店铺","",$shop_name));

    // Shop owner account. First layout: the seller-name anchor.
    // The capture is lazy so it stops at the first closing </a><br>, and whitespace before the
    // label is optional rather than a required literal space.
    preg_match_all('/<a class=\"seller-name J_TGoldlog\"[\s\S]*?target=\"_blank\">\s*掌柜:([\s\S]*?)<\/a><br>/',$content,$store_accounts);
    if (empty($store_accounts[0])) {
        // Fallback layout: the "info-item" paragraph that precedes the customer-service entry.
        preg_match_all('/<p class=\"info-item\"[\s\S]*?"><span class="title">\s*掌[\s\S]*?柜:<\/span>([\s\S]*?)<\/p>[\s\S]*?<span class="title">\s*客[\s\S]*?服:/',$content,$store_accounts);
    }
    // Neither pattern may match (layout change, failed fetch); fall back to an empty owner
    // instead of reading an undefined offset.
    $owner = isset($store_accounts[1][0]) ? trim($store_accounts[1][0]) : '';
   
    
    // 此方法为单纯的 curl 请求
    /**
     * Perform a plain cURL GET for the given URL and return the response body.
     *
     * The request headers are whatever $this->randIp() returns.
     *
     * @param string $url Address to fetch.
     * @return string|bool Response body, or false when the transfer fails.
     */
    private function getRquest($url)
    {
        // Build the headers first, before any cURL handle exists.
        $requestHeaders = $this->randIp();

        $handle = curl_init();
        curl_setopt_array($handle, [
            CURLOPT_URL            => $url,
            // Return the body from curl_exec() instead of printing it.
            CURLOPT_RETURNTRANSFER => true,
            // Keep response headers out of the returned body.
            CURLOPT_HEADER         => false,
            // Empty string: accept every encoding cURL supports and decode it transparently.
            CURLOPT_ENCODING       => '',
            CURLOPT_HTTPHEADER     => $requestHeaders,
        ]);

        // Execute the request and collect the HTML document.
        $body = curl_exec($handle);

        // Release the cURL handle.
        curl_close($handle);

        return $body;
    }
    
    // 也可以使用此方法:增加了 header 头以及构造的随机 ip
    /**
     * Fetch a URL with cURL, sending browser-like headers plus a randomly generated
     * client IP in the CLIENT-IP / X-FORWARDED-FOR headers.
     *
     * Redirects are followed (bounded, HTTP/HTTPS only) and the transfer is time-limited
     * so an unresponsive host cannot block the request indefinitely.
     *
     * @param string $url Address to fetch.
     * @return string|bool Response body, or false when the handle cannot be created
     *                     or the transfer fails.
     */
    private function getHeaderRequest($url)
    {
        // Inclusive [start, end] address ranges as signed 32-bit integers (long2ip() input).
        $ipRanges = array(
            array(607649792, 608174079),     // 36.56.0.0-36.63.255.255
            array(1038614528, 1039007743),   // 61.232.0.0-61.237.255.255
            array(1783627776, 1784676351),   // 106.80.0.0-106.95.255.255
            array(2035023872, 2035154943),   // 121.76.0.0-121.77.255.255
            array(2078801920, 2079064063),   // 123.232.0.0-123.235.255.255
            array(-1950089216, -1948778497), // 139.196.0.0-139.215.255.255
            array(-1425539072, -1425014785), // 171.8.0.0-171.15.255.255
            array(-1236271104, -1235419137), // 182.80.0.0-182.92.255.255
            array(-770113536, -768606209),   // 210.25.0.0-210.47.255.255
            array(-569376768, -564133889),   // 222.16.0.0-222.95.255.255
        );
        // Pick a range by the table's actual size so adding/removing rows cannot go out of bounds.
        $range = $ipRanges[mt_rand(0, count($ipRanges) - 1)];
        $ip = long2ip(mt_rand($range[0], $range[1]));

        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        // The cookie value must be replaced with your own; copy it from a real browser request.
        $headers = array(
            "Content-type: text/xml;charset=\"utf-8\"",
            "Accept: text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1",
            "Cache-Control: no-cache",
            "Pragma: no-cache",
            "cookie:lid=%E9%87%8E%E7%8B%BChy; cna=B129FQUBcC0CAXoEOzNJdBZo; ali_apache_track=c_mid=b2b-1791803016|c_lid=%E9%87%8E%E7%8B%BChy|c_ms=1; UM_distinctid=16cb31d9d5fbdc-0f653114eac331-SD; _is_show_loginId_chang-gsd6_false; __rn_alert__=false; isg=BD4-dSADSADs; l=dSD-sdSD-VC..",
            "user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
            'CLIENT-IP:'.$ip,
            'X-FORWARDED-FOR:'.$ip,
        );

        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        curl_setopt($ch, CURLOPT_URL, $url);
        // Return the body from curl_exec() instead of printing it.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // Keep response headers out of the returned body.
        curl_setopt($ch, CURLOPT_HEADER, 0);
        // Empty string: accept every encoding cURL supports and decode it transparently.
        curl_setopt($ch, CURLOPT_ENCODING, "");
        // Follow redirects, but cap the chain and never leave HTTP(S).
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
        curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        // Bound both the connection phase and the whole transfer.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);

        // Execute the request and collect the HTML document.
        $output = curl_exec($ch);
        // Release the cURL handle.
        curl_close($ch);

        return $output;
    }


天猫店铺

    // Tmall variant: the selectors below can be worked out by inspecting the fetched page.
    // NOTE(review): presumably runs after the page has been loaded into phpQuery, as in the
    // Taobao snippet — confirm.
    // Shop name: text of the element matched by the selector.
    $shop_name= pq("#shopExtra>.slogo>.slogo-shopname>strong")->text();
    // Shop owner: text of the anchor matched by the selector.
    $owner =  pq(".extend>ul>.shopkeeper>.right>a")->text();
正文完
 0