获取一个 HTML 网页上的所有链接

日期：2024-12-16 作者：czdytfhm4

核心提示：下面的 VB.NET 示例依次导入 System.IO、System.Net、System、System.Text 和 System.Text.RegularExpressions，并定义 HTMLContentParser 类，用于下载网页内容并提取其中的链接和图片地址。

'///////////////////////////
Imports System.IO
Imports System.Net
Imports System
Imports System.Text
Imports System.Text.RegularExpressions
''' <summary>
''' Downloads an HTML page and extracts the targets of its anchor (href) and
''' image (src) attributes as absolute URLs.
''' </summary>
Public Class HTMLContentParser

    ' Matches the href attribute of an <a ...> tag. The value may be double-quoted,
    ' single-quoted or unquoted; it is captured in the "url" group. The lazy
    ' [^>]*? keeps the match inside a single tag so that several links on the same
    ' line are all found.
    Private Shared ReadOnly LinkPattern As New Regex( _
        "<a\b[^>]*?\shref\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>""']+))", _
        RegexOptions.IgnoreCase Or RegexOptions.Compiled)

    ' Same as LinkPattern, but for the src attribute of an <img ...> tag.
    Private Shared ReadOnly ImagePattern As New Regex( _
        "<img\b[^>]*?\ssrc\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>""']+))", _
        RegexOptions.IgnoreCase Or RegexOptions.Compiled)

    ''' <summary>
    ''' Fetches the resource at <paramref name="sURL"/> and returns its body as text.
    ''' </summary>
    ''' <param name="sURL">Absolute URL of the page to download.</param>
    ''' <returns>
    ''' The response body, or - to stay compatible with existing callers - the
    ''' exception message when the request fails. Callers cannot distinguish the
    ''' two cases from the return value alone.
    ''' </returns>
    Function Return_HTMLContent(ByVal sURL As String) As String
        Dim response As WebResponse = Nothing
        Dim reader As StreamReader = Nothing
        Try
            response = WebRequest.Create(sURL).GetResponse()
            ' StreamReader decodes as UTF-8 unless a byte-order mark says otherwise.
            reader = New StreamReader(response.GetResponseStream())
            Return reader.ReadToEnd()
        Catch ex As Exception
            Return ex.Message
        Finally
            ' Always release the connection, even when reading fails.
            If Not reader Is Nothing Then reader.Close()
            If Not response Is Nothing Then response.Close()
        End Try
    End Function

    ''' <summary>
    ''' Returns every hyperlink target found in <paramref name="sHTMLContent"/>.
    ''' </summary>
    ''' <param name="sHTMLContent">HTML markup to scan.</param>
    ''' <param name="sURL">URL of the page, used to resolve relative links.</param>
    ''' <returns>An ArrayList of absolute URL strings, in document order.</returns>
    Function ParseHTMLLinks(ByVal sHTMLContent As String, ByVal sURL As String) As ArrayList
        Return ExtractUrls(LinkPattern, sHTMLContent, sURL)
    End Function

    ''' <summary>
    ''' Returns every image source found in <paramref name="sHTMLContent"/>.
    ''' </summary>
    ''' <param name="sHTMLContent">HTML markup to scan.</param>
    ''' <param name="sURL">URL of the page, used to resolve relative sources.</param>
    ''' <returns>An ArrayList of absolute URL strings, in document order.</returns>
    Function ParseHTMLImages(ByVal sHTMLContent As String, ByVal sURL As String) As ArrayList
        Return ExtractUrls(ImagePattern, sHTMLContent, sURL)
    End Function

    ' Runs the given pattern over the markup and resolves each captured "url" group
    ' against the page URL.
    Private Function ExtractUrls(ByVal pattern As Regex, ByVal sHTMLContent As String, ByVal sURL As String) As ArrayList
        Dim results As New ArrayList()
        If sHTMLContent Is Nothing Then
            Return results
        End If

        Dim current As Match = pattern.Match(sHTMLContent)
        While current.Success
            results.Add(ProcessURL(current.Groups("url").Value, sURL))
            current = current.NextMatch()
        End While
        Return results
    End Function

    ' Turns a raw attribute value into an absolute URL relative to the page URL.
    ' System.Uri handles every scheme (http, https, mailto, ...), root-relative
    ' paths and "../" segments; plain string joining is only the fallback for a
    ' base URL that Uri cannot parse.
    Private Function ProcessURL(ByVal sInput As String, ByVal sURL As String) As String
        If sInput Is Nothing Then sInput = ""
        If sURL Is Nothing Then sURL = ""

        Dim cleaned As String = CleanAttributeValue(sInput)
        Try
            Dim baseUri As New Uri(sURL)
            Return New Uri(baseUri, cleaned).AbsoluteUri
        Catch ex As UriFormatException
            Return JoinUrl(sURL, cleaned)
        End Try
    End Function

    ' Strips markup residue from a captured attribute value: everything from the
    ' first ">" (literal or as the &gt; entity) onwards, plus stray quotes and "<".
    Private Function CleanAttributeValue(ByVal value As String) As String
        Dim cut As Integer = value.IndexOf("&gt;")
        If cut >= 0 Then value = value.Substring(0, cut)
        cut = value.IndexOf(">")
        If cut >= 0 Then value = value.Substring(0, cut)

        value = value.Replace("&quot;", "")
        value = value.Replace("&lt;", "")
        value = value.Replace("<", "")
        value = value.Replace("""", "")
        value = value.Replace("'", "")
        Return value.Trim()
    End Function

    ' Fallback joiner: appends a relative path to the directory part of the base
    ' URL, making sure exactly one "/" separates the two.
    Private Function JoinUrl(ByVal baseUrl As String, ByVal relative As String) As String
        ' Anything carrying its own scheme is already absolute.
        If relative.IndexOf("://") >= 0 Then
            Return relative
        End If

        ' Look for a path slash only after the "scheme://" prefix, if there is one.
        Dim searchFrom As Integer = 0
        Dim schemeEnd As Integer = baseUrl.IndexOf("://")
        If schemeEnd >= 0 Then searchFrom = schemeEnd + 3

        ' A bare host ("http://example.com") gets a trailing slash.
        If baseUrl.IndexOf("/"c, searchFrom) < 0 Then
            baseUrl &= "/"
        End If

        ' Drop the document name, keeping the directory with its trailing slash.
        baseUrl = baseUrl.Substring(0, baseUrl.LastIndexOf("/"c) + 1)

        If relative.StartsWith("/") Then
            Return baseUrl.Substring(0, baseUrl.Length - 1) & relative
        End If
        Return baseUrl & relative
    End Function
End Class

本文地址：http://w.yusign.com/tjnews/870.html 述古往 http://w.yusign.com/static/ , 查看更多

特别提示：本信息由相关用户自行提供，真实性未证实，仅供参考。请谨慎采用，风险自负。

点赞 0举报收藏 0打赏 0

更多>同类生活信息

相关文章

最新文章

推荐文章

推荐图文

生活信息

点击排行

• 学科教学历史考研院校有哪些	• 百度停PC浏览器服务国研政情·经济信息智库：
• 更多安狗狗清理大师安卓历史版本（6个）	• 六款超级神器！Markdown编辑器、Windows系统安
• 孝感爱采购标题撰写	• 日照网站建设排名优化，打造高效、用户友好的网
• 漫蛙App官网入口及使用指南：全方位探索动漫世	• 湖北亿纬动力取得一种测试用夹持装置专利，能保
• 明天有小雪，气温下降，雪后风大！	• 天正三亚湾壹号-售楼中心丨楼盘解析丨户型、价