% Conference paper from the proceedings of WTAS 2006 (2nd IASTED International
% Conference on Web Technologies, Applications, and Services).
% The citation key is the exporter-generated identifier; it is kept as-is so
% that existing citations keep resolving.
@inproceedings{1a1d8696a117452db7a0c5ff84555491,
  author    = {Hsu, Ting Chao and Chang, Hsien Tsung and Wu, Sun},
  title     = {On-the-fly detection of content-poor webpaths},
  booktitle = {Proceedings of the Second {IASTED} International Conference on Web Technologies, Applications, and Services, {WTAS} 2006},
  pages     = {197--203},
  year      = {2006},
  isbn      = {9780889865778},
  language  = {English},
  keywords  = {Content filter, Web crawler},
  abstract  = {Web page crawling is an essential part of a web search engine. As the number of web pages in the Web is so big, it's practically impossible for a search engine to cover all web pages. An important question for the search engine is then ``Which web pages should be crawled and indexed ?''. In our observation, we found that most of the index-worthless web pages in a web site are in a same directory or generated by a same CGI program. We use webpath to denote the set of web pages residing in a same directory or generated by a same CGI program and we call it a content-poor webpath if it contains mostly index-worthless web pages. In this paper, we present an approach to detect the content poor webpaths on the fly, such that the crawler can improve the quality of the data crawling. We use statistical approach by analyzing URL patterns and page content structures in the crawled pages to decide whether a webpath is content poor. Our experimental results show that, given a fixed time interval, the data crawler with content-poor webpath filtering will produce a search index that has approximately 10\% of search result improvement, compared to the original crawler without the filter. The precision of detection is exceeding 90\%.},
  note      = {2nd IASTED International Conference on Web Technologies, Applications, and Services, WTAS 2006 ; Conference date: 17-07-2006 Through 19-07-2006},
}