C# 抓取网页img的src带参数的图片链接，并下载

admin

2021年3月18日 12:08 本文热度 3865

using System;

using System.Collections.Generic;

using System.ComponentModel;

using System.Data;

using System.Drawing;

using System.IO;

using System.Linq;

using System.Net;

using System.Text;

using System.Text.RegularExpressions;

using System.Threading;

using System.Windows.Forms;

namespace ImageCollection
{
    /// <summary>
    /// Form that fetches a web page, extracts the <c>src</c> of every <c>img</c> tag
    /// (including URLs that carry a query string) and downloads the matching images
    /// into the "img" folder next to the executable.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>
        /// Folder the images are saved to. The field is named <c>Path</c>, so the
        /// <c>System.IO.Path</c> class has to be fully qualified inside this class.
        /// </summary>
        private static readonly string Path =
            System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "img");

        /// <summary>
        /// Only images served from this host are downloaded. "www.xxx.com" is the
        /// placeholder used by the original listing; replace it with the real host.
        /// </summary>
        private const string AllowedHost = "www.xxx.com";

        /// <summary>Matches an img tag and captures its src value in the "imgUrl" group.</summary>
        private static readonly Regex ImgTagRegex = new Regex(
            @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Captures the text between the title tags. Singleline lets the title span
        /// several lines, IgnoreCase accepts upper-case tags.
        /// </summary>
        private static readonly Regex TitleRegex = new Regex(
            @"<title[^>]*>(?<title>.*?)</title>",
            RegexOptions.IgnoreCase | RegexOptions.Singleline);

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Click handler of the "grab" button: validates the URL typed into
        /// <c>txturl</c> and starts the crawl on a background thread so the UI stays responsive.
        /// </summary>
        private void btnshuaqu_Click(object sender, EventArgs e)
        {
            string url = txturl.Text.Trim();
            if (string.IsNullOrEmpty(url))
            {
                MessageBox.Show("请输入URl");
                return;
            }

            txtimg.AppendText("开始抓取中:\r\n");

            Thread th = new Thread(() => ShuaQu(url)) { IsBackground = true };
            th.Start();
        }

        /// <summary>
        /// Thread entry point for one crawl. Any failure is written to the log box:
        /// an exception escaping a thread's entry method would terminate the process.
        /// </summary>
        /// <param name="url">Address of the page to scan for images.</param>
        private void ShuaQu(string url)
        {
            try
            {
                CrawlAndDownload(url);
            }
            catch (Exception ex)
            {
                AppendLog("抓取失败: " + ex.Message);
            }
        }

        /// <summary>
        /// Empties the image folder, downloads the page, shows its title, downloads
        /// every image hosted on <see cref="AllowedHost"/> and waits for all downloads
        /// to finish before reporting completion.
        /// </summary>
        private void CrawlAndDownload(string url)
        {
            // Start every run with an empty target folder.
            if (System.IO.Directory.Exists(Path))
            {
                System.IO.Directory.Delete(Path, true);
            }
            System.IO.Directory.CreateDirectory(Path);

            // WebHttp is a project helper defined outside this file; null/empty is
            // guarded here because the regex calls below reject a null input.
            string result = WebHttp.HttpGet(url, null, 3);
            if (string.IsNullOrEmpty(result))
            {
                AppendLog("未获取到网页内容!");
                return;
            }

            string[] imageUrls = GetHtmlImageUrlList(result);
            AppendLog("已经获取到数据!" + imageUrls.Length);

            // Show only the text between the title tags, without double quotes.
            Match titleMatch = TitleRegex.Match(result);
            string title = titleMatch.Success ? titleMatch.Groups["title"].Value.Trim() : string.Empty;
            title = title.Replace("\"", string.Empty);
            RunOnUi(txttitle, () => txttitle.Text = title);

            // The page address is the base for relative src values such as "/img/a.png".
            Uri pageUri;
            bool hasPageUri = Uri.TryCreate(url, UriKind.Absolute, out pageUri);

            List<Thread> downloads = new List<Thread>();
            foreach (string src in imageUrls)
            {
                if (string.IsNullOrEmpty(src))
                {
                    continue;
                }

                Uri imageUri;
                bool parsed;
                if (hasPageUri)
                {
                    parsed = Uri.TryCreate(pageUri, src, out imageUri);
                }
                else
                {
                    parsed = Uri.TryCreate(src, UriKind.Absolute, out imageUri);
                }

                if (!parsed || !string.Equals(imageUri.Host, AllowedHost, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // Absolute http(s) URLs are passed on exactly as written in the page so
                // the saved file name is unchanged; anything else uses the resolved form.
                bool isAbsoluteHttp =
                    src.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                    src.StartsWith("https://", StringComparison.OrdinalIgnoreCase);

                // A fresh local per iteration: safe to capture in the lambda on every compiler version.
                string imageUrl = isAbsoluteHttp ? src : imageUri.AbsoluteUri;

                Thread downimg = new Thread(() => DownloadAndReport(imageUrl)) { IsBackground = true };
                downimg.Start();
                downloads.Add(downimg);

                AppendLog(imageUrl);
            }

            // Report completion only after every download thread has actually ended.
            foreach (Thread pending in downloads)
            {
                pending.Join();
            }

            AppendLog("全部抓取完成!");
        }

        /// <summary>
        /// Thread entry point for a single image: downloads it and logs a failure
        /// instead of letting the exception take the process down.
        /// </summary>
        private void DownloadAndReport(string imageUrl)
        {
            try
            {
                Get_img(imageUrl);
            }
            catch (Exception ex)
            {
                AppendLog("下载失败: " + imageUrl + " (" + ex.Message + ")");
            }
        }

        /// <summary>
        /// Downloads one image into the image folder. The file name is the last path
        /// segment of the URL with the query string ("?..." part) removed.
        /// </summary>
        /// <param name="imgpath">Absolute URL of the image; may contain a query string.</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="imgpath"/> is null/empty or contains no file name.
        /// </exception>
        public void Get_img(string imgpath)
        {
            if (string.IsNullOrEmpty(imgpath))
            {
                throw new ArgumentException("Image URL must not be empty.", "imgpath");
            }

            // Drop the query string so characters such as '?' and '&' never reach the file name.
            string withoutQuery = imgpath.Split('?')[0];
            string name = System.IO.Path.GetFileName(withoutQuery);
            if (string.IsNullOrEmpty(name))
            {
                throw new ArgumentException("Image URL does not contain a file name: " + imgpath, "imgpath");
            }

            // No-op when the folder already exists; needed when this method is called on its own.
            System.IO.Directory.CreateDirectory(Path);

            // If custom request headers or timeouts are ever needed, HttpWebRequest can be used instead.
            using (WebClient mywebclient = new WebClient())
            {
                mywebclient.DownloadFile(imgpath, System.IO.Path.Combine(Path, name));
            }
        }

        /// <summary>
        /// Returns the URLs of all images found in an HTML document.
        /// </summary>
        /// <param name="sHtmlText">HTML source.</param>
        /// <returns>
        /// One entry per matched img tag, HTML-decoded (so "&amp;amp;" in a query string
        /// becomes "&amp;"); an empty array when the input is null or empty.
        /// </returns>
        private string[] GetHtmlImageUrlList(string sHtmlText)
        {
            if (string.IsNullOrEmpty(sHtmlText))
            {
                return new string[0];
            }

            MatchCollection matches = ImgTagRegex.Matches(sHtmlText);
            string[] sUrlList = new string[matches.Count];
            for (int i = 0; i < matches.Count; i++)
            {
                // Attribute values are entity-encoded in HTML source; decode to get the real URL.
                sUrlList[i] = WebUtility.HtmlDecode(matches[i].Groups["imgUrl"].Value);
            }

            return sUrlList;
        }

        /// <summary>Appends one line to the log text box from any thread.</summary>
        private void AppendLog(string message)
        {
            RunOnUi(txtimg, () => txtimg.AppendText(message + "\r\n"));
        }

        /// <summary>
        /// Runs <paramref name="action"/> on the thread that owns <paramref name="control"/>.
        /// Does nothing when the control has no handle or is disposed (e.g. the form was closed).
        /// </summary>
        private static void RunOnUi(Control control, Action action)
        {
            if (control.IsDisposed || !control.IsHandleCreated)
            {
                return;
            }

            try
            {
                control.Invoke(action);
            }
            catch (InvalidOperationException)
            {
                // The control was disposed between the check and the call
                // (ObjectDisposedException derives from this type); nothing left to update.
            }
        }
    }
}

#region Download an image into an Image

/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it as an image.
/// </summary>
/// <param name="url">Address of the image to download.</param>
/// <returns>The decoded image; the caller is responsible for disposing it.</returns>
public static Image UrlToImage(string url) {
    byte[] bytes;
    using (WebClient mywebclient = new WebClient()) {
        bytes = mywebclient.DownloadData(url);
    }

    // GDI+ needs the source stream to stay open for the whole lifetime of the Image,
    // so the MemoryStream is deliberately NOT wrapped in a using block. It only holds
    // managed memory and is reclaimed together with the Image.
    MemoryStream ms = new MemoryStream(bytes);
    try {
        return Image.FromStream(ms);
    }
    catch {
        // No Image took ownership of the stream (e.g. the data is not a valid image).
        ms.Dispose();
        throw;
    }
}

#endregion

该文章在 2021/3/18 12:09:33 编辑过

关键字查询

正在查询...

点晴ERP是一款针对中小制造业的专业生产管理软件系统，系统成熟度和易用性得到了国内大量中小企业的青睐。

点晴PMS码头管理系统主要针对港口码头集装箱与散货日常运作、调度、堆场、车队、财务费用、相关报表等业务管理，结合码头的业务特点，围绕调度、堆场作业而开发的。集技术的先进性、管理的有效性于一体，是物流码头及其他港口类企业的高效ERP管理信息系统。

点晴WMS仓储管理系统提供了货物产品管理、销售管理、采购管理、仓储管理、仓库管理、保质期管理、货位管理、库位管理、生产管理、WMS管理系统、标签打印、条形码、二维码管理、批号管理等功能。

点晴免费OA是一款软件和通用服务都免费，不限功能、不限时间、不限用户的免费OA协同办公管理系统。