본문 바로가기
개발노트/C#

[C#] 네이버 API를 활용하여 뉴스기사 웹크롤링 - HtmlAgilityPack 라이브러리

by lovvepearl 2024. 3. 22.

.html 파일 생성

//css_ref HtmlAgilityPack.dll;
//using HtmlAgilityPack;

// Downloads one page of Naver news search results and saves the selected HTML
// fragment to "{파일명}.html" inside {폴더경로}.
// Returns "성공" on success, otherwise a message describing the failure.
string data1 = {검색 키워드}; // when combining two or more keywords, join them with '+'
string data2 = {검색 시작일};  // substituted into the "ds" query parameter
string data3 = {검색 종료일};  // substituted into the "de" query parameter
string data4 = {페이지 번호};  // substituted into the "start" query parameter
string data5 = "3";          // substituted into the "pd" query parameter

// Build the search URL.
string url = string.Format(
    "https://search.naver.com/search.naver?where=news&query={0}&sm=tab_opt&sort=1&photo=0&field=0&pd={4}&ds={1}&de={2}&docid=&related=0&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so%3Add%2Cp%3a:all&start={3}",
    data1, data2, data3, data4, data5);

// XPath of the node whose presence confirms the page loaded; an empty string
// means "save the whole document".
string check_xpath = "/html/body";

string html = "";
string result = "";

try
{
    HtmlWeb htmlWeb = new HtmlWeb();
    HtmlAgilityPack.HtmlDocument htmlDoc = htmlWeb.Load(url);

    // Each failure stops further processing. Previously the code only set the
    // message and then dereferenced the null object anyway, so the specific
    // message was replaced by the generic one from the catch block below.
    if (htmlDoc == null)
    {
        result = "웹페이지 로딩 실패";
    }
    else if (htmlDoc.DocumentNode == null)
    {
        result = "문서 노드 미존재로 html생성 실패";
    }
    else if (check_xpath == "")
    {
        html = htmlDoc.DocumentNode.OuterHtml;
    }
    else
    {
        HtmlNode htmlNode = htmlDoc.DocumentNode.SelectSingleNode(check_xpath);
        if (htmlNode == null)
        {
            result = "웹페이지 로딩 완료 확인 노드 찾기 실패";
        }
        else
        {
            html = htmlNode.OuterHtml;
        }
    }
}
catch (Exception ex)
{
    result = "예외발생으로 html생성 실패 : " + ex.Message;
}

if (result != "")
{
    return result;
}

string path = Path.Combine({폴더경로}, "{파일명}.html");
// Write UTF-8 (with BOM) so that characters outside the machine's ANSI code page
// are not lost. File.ReadAllText detects the BOM, so a reader that passes
// Encoding.Default still decodes the file correctly.
System.IO.File.WriteAllText(path, html, Encoding.UTF8);
return "성공";

 

생성된 html Parsing

//css_ref HtmlAgilityPack.dll;
//using HtmlAgilityPack;
//using System.Text.RegularExpressions;  

// Parses the saved search-result HTML and returns one line per article in the form
// "date<TAB>press<TAB>title<TAB>link\n". An article is dropped when its title ends
// with the same 5 significant characters as the article immediately before it.
// Returns an empty string when no article nodes are found.
string path = Path.Combine({폴더경로}, "{파일명}.html");
string value = System.IO.File.ReadAllText(path,Encoding.Default);

HtmlAgilityPack.HtmlDocument HtmlDoc = new HtmlAgilityPack.HtmlDocument();
HtmlDoc.LoadHtml(value);

// All elements whose id contains 'sp_nws' (one per news item).
HtmlAgilityPack.HtmlNodeCollection nodeCol = HtmlDoc.DocumentNode.SelectNodes("//*[contains(@id,'sp_nws')]");

string res = "";

// SelectNodes returns null (not an empty collection) when nothing matches.
if (nodeCol == null)
{
    return res;
}

// Read the clock once so every relative date is resolved against the same instant.
DateTime now = DateTime.Now;
System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
string[] absoluteFormats = new string[] { "yyyy.MM.dd", "yyyy.M.d" };

// Comparison key of the previous emitted-or-dropped article; null before the first one.
string prevKey = null;

foreach (HtmlAgilityPack.HtmlNode node in nodeCol)
{
    HtmlAgilityPack.HtmlNode pressNode = node.SelectSingleNode("./div/div/div[1]/div[2]/a[1]"); // press name
    HtmlAgilityPack.HtmlNode infoNode = node.SelectSingleNode("./div/div/div[1]/div[2]/span");   // date (or print-page info)
    HtmlAgilityPack.HtmlNode linkNode = node.SelectSingleNode("./div/div/div[2]/a[1]");          // link
    HtmlAgilityPack.HtmlNode titleNode = node.SelectSingleNode("./div/div/div[2]/a[2]");         // title

    // Skip items that do not have the expected layout instead of throwing.
    if (pressNode == null || infoNode == null || linkNode == null || titleNode == null)
    {
        continue;
    }

    HtmlAgilityPack.HtmlAttribute hrefAttr = linkNode.Attributes["href"];
    HtmlAgilityPack.HtmlAttribute titleAttr = titleNode.Attributes["title"];
    if (hrefAttr == null || titleAttr == null)
    {
        continue;
    }

    string news = pressNode.InnerText;
    string link_t = hrefAttr.Value;
    string sub = titleAttr.Value.Replace("&quot;","").Replace("&amp;","").Replace("&#39;","'");

    // When the first span contains "면", the date is taken from the second span.
    string dateText = infoNode.InnerText;
    if (dateText.Contains("면"))
    {
        HtmlAgilityPack.HtmlNode dateNode = node.SelectSingleNode("./div/div/div[1]/div[2]/span[2]");
        dateText = (dateNode == null) ? "" : dateNode.InnerText;
    }
    dateText = dateText.Trim();

    // Convert the displayed date to yyyy-MM-dd. Relative forms are resolved by
    // their own unit (previously the "면" case always subtracted hours, even for
    // "N일 전" or an absolute date). Unrecognized forms leave the date empty.
    string day = "";
    string digits = Regex.Replace(dateText, @"\D", "");
    int amount;
    DateTime absolute;
    if (dateText.Contains("일 전") && int.TryParse(digits, out amount))
    {
        day = now.AddDays(-amount).ToString("yyyy-MM-dd", invariant);
    }
    else if (dateText.Contains("시간 전") && int.TryParse(digits, out amount))
    {
        day = now.AddHours(-amount).ToString("yyyy-MM-dd", invariant);
    }
    else if (dateText.Contains("분 전") && int.TryParse(digits, out amount))
    {
        day = now.AddMinutes(-amount).ToString("yyyy-MM-dd", invariant);
    }
    else if (DateTime.TryParseExact(dateText.TrimEnd('.'), absoluteFormats, invariant,
                                    System.Globalization.DateTimeStyles.None, out absolute))
    {
        // NOTE(review): assumes absolute dates are rendered like "2024.03.15." — confirm against the page.
        day = absolute.ToString("yyyy-MM-dd", invariant);
    }

    // Comparison key: the last 5 characters of the title after removing everything
    // except letters, digits, Hangul syllables and '_'. Shorter titles use all of
    // their remaining characters instead of throwing in Substring.
    string key = Regex.Replace(sub, @"[^a-zA-Z0-9가-힣_]", "", RegexOptions.Singleline);
    if (key.Length > 5)
    {
        key = key.Substring(key.Length - 5);
    }

    // The first article is always kept; later ones are kept unless their key equals
    // the key of the article immediately before them. An empty key never counts as
    // a duplicate.
    bool isDuplicate = prevKey != null && key.Length > 0 && string.Equals(prevKey, key, StringComparison.Ordinal);
    if (!isDuplicate)
    {
        res += day + "\t" + news + "\t" + sub + "\t" + link_t + "\n";
    }
    prevKey = key;
}

return res;