`
dcdc723
  • 浏览: 182995 次
  • 性别: Icon_minigender_1
  • 来自: 北京
社区版块
存档分类
最新评论

用C#写的一个采集某大型B2C数据的程序(数据采集)

    博客分类:
  • net
阅读更多
using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
namespace web
{
    /// <summary>
    /// Web Forms page that downloads a product listing page, pulls the product
    /// links out of it with regular expressions, downloads every linked detail
    /// page and writes the extracted fields to the HTTP response.
    /// </summary>
    public partial class dang : System.Web.UI.Page
    {
        /// <summary>Options used by the patterns that cut whole multi-line sections out of a page.</summary>
        private const RegexOptions BlockOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

        /// <summary>
        /// Detail-page fields, in output order. Each entry is
        /// { pattern locating the field, pattern stripped from its start, pattern stripped from its end }.
        /// </summary>
        private static readonly string[][] DetailFields = new string[][]
        {
            new string[] { @"<li>出版时间:.*?</li>", "<li>出版时间:", "</li>" },
            new string[] { @"<li>字  数:.*?</li>", "<li>字  数:", "</li>" },
            new string[] { @"<li>版  次:.*?</li>", "<li>版  次:", "</li>" },
            new string[] { @"<li>页  数:.*?</li>", "<li>页  数:", "</li>" },
            new string[] { @"<li>印刷时间:.*?</li>", "<li>印刷时间:", "</li>" },
            new string[] { @"<li>开  本:.*?</li>", "<li>开  本:", "</li>" },
            new string[] { @"<li>印  次:.*?</li>", "<li>印  次:", "</li>" },
            // Fixed: the locating pattern used to read "<li纸" (missing '>'), so it never matched.
            new string[] { @"<li>纸  张:.*?</li>", "<li>纸  张:", "</li>" },
            new string[] { @"<li>I S B N    :.*?</li>", "<li>I S B N    :", "</li>" },
            new string[] { @"<li>包  装:.*?</li>", "<li>包  装:", "</li>" },
            new string[] { @"<div id='publisher_'>出 版 社:.*?</div>", "<div id='publisher_'>出 版 社:", "</div>" },
            new string[] { @"<div id='author_' >作  者:.*?</div>", "<div id='author_' >作  者:", "</div>" },
            new string[] { @"<span class=""gray87"">定价:<span class=""del"">.*?</span></span>", @"<span class=""gray87"">定价:<span class=""del"">", "</span></span>" },
            // The stripped prefix additionally removes the opening <b> that wraps the price.
            new string[] { @"<span class=""redc30"">价格:.*?</b></span>", @"<span class=""redc30"">价格:<b>", "</b></span>" },
            new string[] { @"内容简介</h2> <div class=""right_content"">.*?</div><div class=""dashed"">", @"内容简介</h2> <div class=""right_content"">", @"</div><div class=""dashed"">" },
            new string[] { @"目录</h2> <div class=""right_content"">.*?</div>", @"目录</h2> <div class=""right_content"">", "</div>" },
        };

        /// <summary>
        /// Scrapes the listing page and every product page it links to, writing the
        /// extracted fields to the response, then saves a logo image next to the page.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            // One client is reused for every download and disposed afterwards; the
            // original created an extra, never-used client per product and disposed none.
            using (WebClient wc = new WebClient())
            {
                // NOTE(review): Encoding.Default depends on the host's ANSI code page —
                // confirm it matches the encoding served by the target site.
                string listingHtml = Encoding.Default.GetString(wc.DownloadData(@"网址"));
                string[] ress = ExtractProductLinks(listingHtml);

                // The final segment is skipped, exactly as before (presumably it is the
                // remainder after the last separator rather than a link — TODO confirm).
                for (int i = 0; i < ress.Length - 1; i++)
                {
                    // A blank segment is not a downloadable address; skip it instead of
                    // letting DownloadData throw and abort the whole run.
                    if (ress[i].Trim().Length == 0)
                    {
                        continue;
                    }

                    string res1 = Encoding.Default.GetString(wc.DownloadData(ress[i]));
                    res1 = Regex.Replace(res1, @"<!DOCTYPE .*?您最近的浏览历史", string.Empty, BlockOptions);
                    res1 = Regex.Replace(res1, @"<h2 class=""black14"">.*?<!--价格购买区结束-->", string.Empty, BlockOptions);
                    res1 = Regex.Replace(res1, @"<a name=""review_point""></a>.*?</html>", string.Empty, BlockOptions);

                    WriteProductDetails(res1);
                    WriteProductImage(res1);
                }
            }

            // Save an image to disk with WebClient.
            using (WebClient WCp = new WebClient())
            {
                WCp.DownloadFile(@"http://www.XXX.com/img/XXX_logo.gif", Server.MapPath("XXX.gif"));
            }
        }

        /// <summary>
        /// Reduces the listing page markup to a ';'-separated list of product links and splits it.
        /// </summary>
        /// <param name="listingHtml">Raw markup of the listing page.</param>
        /// <returns>The split segments; an empty array when nothing is left after cleaning.</returns>
        private static string[] ExtractProductLinks(string listingHtml)
        {
            string res = listingHtml;

            // Cut away page chrome: header, footer, pager, title bars.
            res = Regex.Replace(res, @"<!DOCTYPE .*?第1页", string.Empty, BlockOptions);
            res = Regex.Replace(res, @"<!--页尾 开始 -->.*?</html>", string.Empty, BlockOptions);
            res = Regex.Replace(res, @"<div id=""divBottomPageNavi"".*?</div>.*?</div>.*?</div>.*?</div>", string.Empty, BlockOptions);
            res = Regex.Replace(res, @"</div><div class='list_r_title_text3a'>.*?list_r_line""></div>", string.Empty, BlockOptions);
            // The trailing lazy ".*?" matches nothing, so only the opening tag itself is removed.
            res = Regex.Replace(res, @"<div class=""clear"">.*?", string.Empty, BlockOptions);

            // Strip everything around each product's name link.
            res = Regex.Replace(res, @"<div class=""list_r_list"">.*?<h2>", string.Empty, BlockOptions);
            res = Regex.Replace(res, @"</h2>.*?</div>.*?</div>", string.Empty, BlockOptions);
            res = Regex.Replace(res, @"<a name=""link_prd_name"" href='", "");
            // "ww" is a temporary marker so the link text up to </a> can be dropped next.
            res = Regex.Replace(res, @"' target=""_blank"">", "ww");
            res = Regex.Replace(res, @"ww.*?</a>", "");
            res = Regex.Replace(res, "</div>", ";");

            // Guard: Substring(0, Length - 1) throws on an empty string.
            if (res.Length == 0)
            {
                return new string[0];
            }

            res = res.Substring(0, res.Length - 1);
            return res.Split(';');
        }

        /// <summary>
        /// Finds the first occurrence of a field and returns its content with the
        /// surrounding markup removed and whitespace trimmed.
        /// </summary>
        /// <param name="html">Markup to search.</param>
        /// <param name="matchPattern">Regex locating the whole field.</param>
        /// <param name="prefixPattern">Regex removed from the matched text (leading markup).</param>
        /// <param name="suffixPattern">Regex removed from the matched text (trailing markup).</param>
        /// <returns>The trimmed field content, or <c>null</c> when the field is absent.</returns>
        private static string ExtractField(string html, string matchPattern, string prefixPattern, string suffixPattern)
        {
            Match match = Regex.Match(html, matchPattern);
            if (!match.Success)
            {
                return null;
            }

            string value = Regex.Replace(match.Value, prefixPattern, "");
            value = Regex.Replace(value, suffixPattern, "");
            return value.Trim();
        }

        /// <summary>
        /// Writes every field from <see cref="DetailFields"/> that is present in the
        /// page, one per line, in table order.
        /// </summary>
        /// <param name="detailHtml">Cleaned markup of a product detail page.</param>
        private void WriteProductDetails(string detailHtml)
        {
            foreach (string[] field in DetailFields)
            {
                string value = ExtractField(detailHtml, field[0], field[1], field[2]);
                if (value != null)
                {
                    // NOTE(review): scraped markup is written unencoded, as before —
                    // confirm this page is never exposed to untrusted viewers.
                    Response.Write(value + "<br>");
                }
            }
        }

        /// <summary>
        /// Writes the product image file name, the small image URL and the derived
        /// large image URL when the image tag is present.
        /// </summary>
        /// <param name="detailHtml">Cleaned markup of a product detail page.</param>
        private void WriteProductImage(string detailHtml)
        {
            Match imageTag = Regex.Match(detailHtml, @"<img src="".*?id=""img_show_prd""/>");
            if (!imageTag.Success)
            {
                return;
            }

            // Small image URL: the tag with its markup stripped.
            string phs1 = Regex.Replace(imageTag.Value, @"<img src=""", "");
            phs1 = Regex.Replace(phs1, @"""  id=""img_show_prd""/>", "");

            string phsname = Regex.Replace(phs1, @"http.*?\.com/\d.*/\d.*/", string.Empty, BlockOptions);
            Response.Write("图片名" + phsname + "<br>");
            Response.Write(phs1 + "<br>");

            // Large image URL: the last five characters are replaced by "o.jpg".
            // Guard: URLs shorter than five characters would make Substring throw.
            if (phs1.Length >= 5)
            {
                string phbb = phs1.Substring(0, phs1.Length - 5) + "o.jpg";
                Response.Write(phbb + "<br>");
            }
        }
    }
}

 

此处只列出了.CS文件

1
1
分享到:
评论
1 楼 dcdc723 2010-01-29  
通过后台创建Javascript到前台
建一个方法
StringBuilder createJSScript = new StringBuilder("var intIndex=0;arrList = new Array(); ");
createJSScript.Append(" function smanPromptList(arrList,objInputId){ \n");
        createJSScript.Append("\n");
ClientScript.RegisterClientScriptBlock(this.GetType(), "arrList", createJSScript.ToString(), true);
再在page_load事件里调用这个方法

相关推荐

Global site tag (gtag.js) - Google Analytics