Quantcast
Channel: CNode:Node.js专业中文社区
Viewing all articles
Browse latest Browse all 14821

node写爬虫的问题

$
0
0

数据量不是特别大,但有很多层循环嵌套,程序很多时候跑到一半终端就未响应了。。。不报错,也不再继续执行,,,

async function init(_s, _e) { 
   for (let _i = _s; _i < _e; _i++) {
        let _isexists = fs.existsSync(`page_${_i}`);
        let _mk = 0;      
        if (_isexists == false){
            fs.mkdirSync(`${__dirname}/page_${_i}`)
        }
        if (!_mk) {
            console.log(`>>正在分析页面-${_i}`)
            let data = await whttps.gets('https://www.thefashionisto.com/fashion/campaign/?vpage=' + _i)
            if (data) {
                $ = cheerio.load(data);
                console.log('>>分析页面完成,加载整个dom文档,获得所需要的二级url')
            }
            $('._self').each(async function(i, item) {
                if ($(this).html() == 'Read More') {
                    let curl = $(this).attr('href');
                    let cdata = await whttps.gets(curl); 
                   

					if (cdata) {
                        let name = curl.replace(/(https:\/\/.*?\/)|(\/)/g, ''); 
                       console.log(`${__dirname}/page_${_i}/${name}`)
                        let isexists = fs.existsSync(`${__dirname}/page_${_i}/${name}`); 
                       let mk_r = 0; 
                       if (isexists == false) {
                            mk_r = fs.mkdirSync(`${__dirname}/page_${_i}/${name}`)
                                // console.log('建立文件夹后的结果', mk_r)
                        };
                        if (!mk_r) {
                            c$ = cheerio.load(cdata); 
                           console.log(`>> 获得详情页面,分析详情页面图片`); 
                           c$('.entry-content figure img').each(async function(i) {
                                let img_url = c$(this).attr('src');
                                let img_name = img_url.replace(/(.*\/)|(\..*)/g, ''); 
                               if (fs.existsSync(`${__dirname}/page_${_i}/${name}/${img_name}.jpg`)) { 
                                   console.log('文件存在,跳过该图片'); 
                               } else {
                                    let img_data = await whttps.gets(img_url, 'binary') 
                                   if (img_data) {
                                        
                                    fs.writeFile(`${__dirname}/page_${_i}/${name}/${img_name}.jpg`, img_data, 'binary', 
									function(error) {
                                            if (error) {
                                                console.log(error);
												return false
                                            };
											console.log(`>> 图片${img_name}保存成功`)
                                        })
                                    }
                                }
                            }); 
                           let relation_img = await whttps.gets(curl + '?relatedposts=1')
                            relation_img = JSON.parse(relation_img)
                            relation_img.items.forEach(async(element, i) => {
                                let img_url = element.img.src; 
                               let img_name = img_url.replace(/(.*\/)|(\..*)/g, ''); 
                               if (fs.existsSync(`${__dirname}/page_${_i}/${name}/small-${img_name}.jpg`)) { 
                                   console.log('文件存在,跳过该图片') 
                               } else { 
                                   let img_data = await whttps.gets(img_url, 'binary') 
                                   if (img_data) { 
                                       fs.writeFile(`${__dirname}/page_${_i}/${name}/small-${img_name}.jpg`, img_data, 'binary',
									   function(error, data) {
                                            if (error) {
                                                console.log(error)
                                                return false  

                                          }; 
										  console.log(`>> 图片${img_name}保存成功`) 
                                       })
                                    }  
                              }
                            }); 
                       }  
                  }  
              }  
          });  
      } 
   }}

来自酷炫的 CNodeMD


Viewing all articles
Browse latest Browse all 14821

Trending Articles