Webページクローラーの基盤構築を考えていたところ、最近はNode.jsを利用するのがトレンドのようなので、とりあえずその波に乗ってみたいと思います。
ただ、今回はクローラーの基本として、単一ページのリンク一覧を取得する事を目的とします。
構成
bootstrapをベースに簡単なフレームワークを作ってますが、静的ページでも動作します。
Node.jsで動くクロール用プログラムと、Webページ側からそのNode.jsプログラムにアクセスするプログラムの2種類を用意しました。
/**
* SiteMap
**/
// Config-----
var flgTarget = "layer";//[ none , domain , layer ]
var urls = [];
// Init-----
var http = require('http');
var FS = require('fs');
var QS = require('querystring');
var saveDir = "data/index/sitemap/";
var accessPort = 3334;
http.createServer(function(req,res){
res.writeHead(200,{
'Content-Type':'text/plain',
'Access-Control-Allow-Origin':'*'
});
$$saveDir(saveDir);
//PostString
var postData='';
req.on('data', function (data) {
postData += unescape(data);
});
req.on('end',function(){
var POST = QS.parse(postData);
$$SPK(POST,res);
});
}).listen(accessPort);
console.log('Server running at http://***:'+accessPort+'/');
//-----
//Library
(function(){
var SPK = require('spooky');
var $$=function(data,res){
var dt = (+new Date());
var spooky = new SPK({
casper:{
logLevel:'debug',
verbose:true,
sslProtocol:'any'
},
child:{
"ssl-protocol":"tlsv1",
"ignore-ssl-errors":true
}
},function(){
spooky.create({viewportSize:{width:data.width,height:data.height}});
spooky.userAgent(data.ua);
spooky.start(data.url);
spooky.then([{url:data.url},function(data){
var links = this.evaluate(function(url){
var links = document.links;
var lists = [url];
for(var i=0;i<links.length;i++){
//if(lists.length && lists.indexOf(links[i])!=-1){continue}
lists.push(links[i].href);
}
return lists;
}, url);
this.emit("checkUrls",links);
}]);
spooky.run();
spooky.on("checkUrls",function(links){
if(links.length==0){return}
var url = data.url;
url = url.replace(/\//g,"\\/");
url = url.replace(/\./g,"\\.");
url = url.replace(/\:/g,"\\:");
url = url.replace(/\-/g,"\\-");
var reg = new RegExp("^"+url,"i");
for(var i=0;i<links.length;i++){
var href = links[i].split("#")[0];
if(urls.length && urls.indexOf(href)!=-1){continue}
//if(flgTarget=="layer" && !href.match(reg)){continue}
urls.push(href);
}
var json = JSON.stringify({counts:urls.length,urls:urls});
res.end(json);
//res.end(urls.join("\n")+"\nurl-count:"+urls.length);
});
});
};
$$SPK = $$;
})();
// Save-Directory
(function(FS){
  /**
   * Creates the directory `path` together with any missing parent
   * directories, walking the "/"-separated segments from left to right.
   *
   * @param {string} path Directory path using "/" as separator.
   */
  var $$=function(path){
    var segments = path.split("/");
    var dir = "";
    for(var i=0;i<segments.length;i++){
      // Empty segments come from trailing or doubled slashes and name no new
      // directory; a leading one is kept so absolute paths start at "/".
      if(segments[i]==="" && i>0){continue}
      dir += segments[i]+"/";
      //check
      if(FS.existsSync(dir)){continue}
      //make (mode rwxr-xr-x; parsed from octal text because a legacy octal
      //literal is a syntax error in strict-mode code)
      FS.mkdirSync(dir,parseInt("755",8));
    }
  };
  $$saveDir = $$;
})(FS);
$ node getSitemap.js
Server running at http://***:3334/
(function(){
var $$={};
var port = "3334";
/**
 * Hooks the "btn" element: a click reads the current value of the "url"
 * element and starts a crawl for it with an empty result list.
 */
$$.start = function(){
  document.getElementById("btn").onclick = function(){
    var input = document.getElementById("url");
    $$.access(input.value,[]);
  };
};
/**
 * Asks the Node.js crawler for the links of one page.
 *
 * Shows the target URL in the "access" element, reveals the "url-loading"
 * indicator and POSTs the request to the crawler listening on `port` of the
 * host that served this page. The response is handled by $$.collbackSuccess.
 *
 * @param {string} targetUrl URL of the page to crawl; nothing happens when it is empty.
 * @param {Array}  listUrl   Result list, passed to the success callback as option.listUrl.
 */
$$.access = function(targetUrl,listUrl){
  if(!targetUrl){return}
  document.getElementById("access").innerHTML = targetUrl;
  document.getElementById("url-loading").style.setProperty("display","inline","");
  // location.hostname never contains the page's own port, so the crawler
  // port can be appended safely even when this page is served from e.g. :8080.
  var nodeAccess = "http://"+location.hostname+":"+port+"/"+"?url="+escape(targetUrl);
  var dir = $$.LIB.dirname(location.href);
  $$.ajax.set({
    url:nodeAccess,
    method:"post",
    async:true,// asynchronous request
    query:{
      // escape()d here and percent-encoded again by $$.ajax.set.
      url:escape(targetUrl),
      dir:dir
    },
    option:{
      listMax:document.form1.max.value,
      listUrl:listUrl,
      startTime:(+new Date())
    },
    onSuccess:$$.collbackSuccess
  });
};
/**
 * Success callback for the crawl request: renders the narrowed URL list.
 *
 * Must be invoked with `this` set to the request description, from which
 * query.url, option.listUrl, option.listMax and option.startTime are read.
 * Writes the "counts", "preview" and "times" elements and hides the
 * "url-loading" indicator. A response that is not the expected JSON (for
 * example an empty body) only hides the indicator.
 *
 * @param {string} res Response text, expected to be JSON {counts, urls}.
 */
$$.collbackSuccess = function(res){
  var loading = document.getElementById("url-loading");
  var json = null;
  try{
    json = JSON.parse(res);
  }
  catch(e){
    json = null;
  }
  if(!json || !json.urls){
    // Nothing usable came back; do not leave the indicator spinning.
    loading.style.setProperty("display","none","");
    return;
  }
  // The URLs come from crawled pages and are written via innerHTML, so they
  // are HTML-escaped first.
  var escapeHtml = function(text){
    return String(text)
      .replace(/&/g,"&amp;")
      .replace(/</g,"&lt;")
      .replace(/>/g,"&gt;")
      .replace(/"/g,"&quot;")
      .replace(/'/g,"&#39;");
  };
  var urls = $$.urlsNarrow(json.urls,this.query.url,this.option.listUrl,this.option.listMax);
  var rows = [];
  for(var i=0;i<urls.length;i++){
    rows.push(escapeHtml(urls[i]));
  }
  document.getElementById("counts").innerHTML = urls.length+"("+escapeHtml(json.counts)+")";
  document.getElementById("preview").innerHTML = rows.join("<br>\n");
  loading.style.setProperty("display","none","");
  // time: seconds elapsed since the request was started
  document.getElementById("times").innerHTML = ((+new Date()) - this.option.startTime)/1000;
};
//first-time
/**
 * Appends to `listUrl` every entry of `urls` that lies under `condition`.
 *
 * Both sides are compared after $$.delPort, case-insensitively and as a
 * prefix match. Entries already in `listUrl` are skipped, and nothing more is
 * added once `listUrl` holds `listMax` entries (a `listMax` that is not
 * greater than 0 means no limit).
 *
 * @param {Array}  urls      Candidate URLs.
 * @param {string} condition URL that accepted candidates must start with.
 * @param {Array}  listUrl   List that is extended in place.
 * @param {*}      listMax   Maximum length of `listUrl`.
 * @returns {Array} The same `listUrl` array.
 */
$$.urlsNarrow = function(urls,condition,listUrl,listMax){
  var prefix = new RegExp("^"+$$.urlEscape($$.delPort(condition)),"i");
  for(var n=0;n<urls.length;n++){
    // Once the list is full no later candidate can be added.
    var isFull = listMax > 0 && listUrl.length >= listMax;
    if(isFull){break}
    var candidate = urls[n];
    //domain-hierarchy-check
    var isBelowCondition = prefix.test($$.delPort(candidate));
    //unique-check
    if(isBelowCondition && listUrl.indexOf(candidate)==-1){
      listUrl.push(candidate);
    }
  }
  return listUrl;
};
/**
 * Unescapes a URL and removes a leading "http:" or "https:" scheme, leaving
 * the protocol-relative form ("//host/path"). Despite its name it does not
 * touch the port. URLs with any other scheme are returned unescaped but
 * otherwise unchanged.
 *
 * @param {string} url URL, possibly escape()d.
 * @returns {string} The unescaped URL without its http/https scheme.
 */
$$.delPort = function(url){
  // Anchored and case-insensitive, so "HTTP://..." is handled and a later
  // "http:" inside the path or query is never removed by mistake.
  return unescape(url).replace(/^https?:/i,"");
};
/**
 * Escapes a URL so it can be embedded literally in a RegExp source.
 *
 * Every regular-expression metacharacter is prefixed with a backslash,
 * including "?", "+", "*", "|" and "\" which occur in query strings; "/" and
 * "-" are escaped as well.
 *
 * @param {string} url Text to escape.
 * @returns {string} Pattern source that matches `url` literally.
 */
$$.urlEscape = function(url){
  return url.replace(/[\\\/.\[\]$^\-{}()*+?|]/g,"\\$&");
};
//repeat-time
/**
 * Small path helpers. Backslashes are treated as path separators and are
 * normalised to "/" before the path is split.
 */
$$.LIB = {
  /**
   * @param {string} path Path or URL.
   * @returns {string} Everything after the last separator (the whole input
   *   when it contains none).
   */
  basename:function(path){
    return path.replace(/\\/g,'/').replace( /.*\//, '');
  },
  /**
   * @param {string} path Path or URL.
   * @returns {string} Everything before the last separator; an input without
   *   any separator is returned as is.
   */
  dirname:function(path){
    return path.replace(/\\/g,'/').replace(/\/[^/]*$/, '');
  }
};
/**
 * Minimal XMLHttpRequest wrapper used to talk to the crawler.
 */
$$.ajax = {
  /**
   * Returns a new request object, trying XMLHttpRequest first and the two
   * ActiveX variants afterwards; null when none can be constructed.
   * The parameter is not used.
   */
  xmlObj:function(f){
    var factories = [
      function(){return new XMLHttpRequest()},
      function(){return new ActiveXObject("Msxml2.XMLHTTP")},
      function(){return new ActiveXObject("Microsoft.XMLHTTP")}
    ];
    for(var n=0;n<factories.length;n++){
      // A factory that throws just means "not available here"; try the next.
      try{return factories[n]()}
      catch(e){}
    }
    return null;
  },
  /**
   * Creates the request object, stores it in $$.ajax.httpoj and sends the
   * request described by `option`.
   *
   * Fields read from `option`: method, url, async, query (object of
   * name/value pairs), querys (array of [name, value] pairs) and onSuccess.
   * Values are percent-encoded and joined into a form-encoded body. When the
   * request reaches readyState 4, option.onSuccess is called as a method of
   * `option` with the response text.
   */
  set:function(option){
    if(!option){return}
    var request = $$.ajax.createHttpRequest();
    $$.ajax.httpoj = request;
    if(!request){return;}
    request.open(option.method , option.url , option.async);
    request.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded');
    // Fires on every state change; only state 4 (response complete) is handled.
    request.onreadystatechange = function(){
      if (this.readyState!=4){return}
      option.onSuccess(this.responseText);
    };
    // Build the body: first the `query` object, then the `querys` pair list.
    var pairs = [];
    if(typeof(option.query)!="undefined"){
      for(var name in option.query){
        pairs.push(name+"="+encodeURIComponent(option.query[name]));
      }
    }
    if(typeof(option.querys)!="undefined"){
      for(var n=0;n<option.querys.length;n++){
        var pair = option.querys[n];
        pairs.push(pair[0]+"="+encodeURIComponent(pair[1]));
      }
    }
    // send
    if(pairs.length){
      request.send(pairs.join("&"));
    }
    else{
      request.send();
    }
  },
  /**
   * Returns a request object for the current browser: ActiveX where
   * window.ActiveXObject exists (newer MSXML first, then the legacy one),
   * otherwise XMLHttpRequest; null when nothing is available.
   */
  createHttpRequest:function(){
    if(!window.ActiveXObject){
      // Browsers that implement XMLHttpRequest natively
      return window.XMLHttpRequest ? new XMLHttpRequest() : null;
    }
    // MSXML2 and later
    try {return new ActiveXObject("Msxml2.XMLHTTP")}
    catch(e){}
    // legacy MSXML
    try{return new ActiveXObject("Microsoft.XMLHTTP")}
    catch(e2){}
    return null;
  }
};
$$.start();
})();
0 件のコメント:
コメントを投稿