Regex
Regex (short for "regular expression") is a pattern-matching tool that is available in most programming languages. In this article, I am using C#. Let's assume you want to extract valuable information from a webpage for research purposes; you can then use a regex to pull that data out of the downloaded HTML.
In the following example, the user enters a website URL into the textbox and then clicks the button to extract the headings from the downloaded page. The code is self-explanatory.
<%@ Page Language="C#" AutoEventWireup="true" CodeFile="RegX.aspx.cs" Inherits="RegX" %>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
<title>Untitled Page</title>
</head>
<body>
<form id="form1" runat="server">
<div>
<%--
    Markup for the RegX page; its logic lives in the code-behind class RegX
    (RegX.aspx.cs, named in the Page directive at the top of this file).
    The page has two panels: a search panel where the user types a URL,
    and a results panel where the code-behind writes its output.
--%>
<%-- Search panel: URL input plus the button that triggers the lookup. --%>
<asp:Panel ID="panUrl" runat="server" GroupingText="Search" CssClass="myPanel">
Enter Url:
<%-- txtUrl: read by the code-behind as the address to download. --%>
<asp:TextBox ID="txtUrl" runat="server" Columns="50" />
<br />
<%-- btnSearch: posts back and runs btnSearch_Click in the code-behind. --%>
<asp:Button ID="btnSearch" runat="server" OnClick="btnSearch_Click" Text="Search" />
</asp:Panel>
<p>
</p>
<%-- Results panel: litContent receives the output (or an error message) from the code-behind. --%>
<asp:Panel ID="panHeadings" runat="server" GroupingText="Headings in this Url" CssClass="myPanel">
<asp:Literal ID="litContent" runat="server" />
</asp:Panel>
</div>
</form>
</body>
</html>
using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Text.RegularExpressions;
using System.Net;
/// <summary>
/// Code-behind for the RegX page: downloads the page at a user-supplied URL
/// and lists every HTML heading (h1-h6) found in it.
/// </summary>
public partial class RegX : System.Web.UI.Page
{
    /// <summary>
    /// Matches one complete heading element, from its opening tag to the
    /// closing tag of the same level.
    /// </summary>
    /// <remarks>
    /// <list type="bullet">
    /// <item><description><c>([1-6])</c> accepts only real heading levels and captures the level.</description></item>
    /// <item><description><c>\b[^&gt;]*</c> allows attributes (e.g. <c>class</c>, <c>id</c>) on the opening tag.</description></item>
    /// <item><description><c>.*?</c> is lazy so two headings on one line are reported separately;
    /// <see cref="RegexOptions.Singleline"/> lets it span line breaks.</description></item>
    /// <item><description><c>\1</c> requires the closing tag to have the same level as the opening tag.</description></item>
    /// </list>
    /// Built once and shared; Regex instances are immutable and thread-safe.
    /// </remarks>
    private static readonly Regex HeadingPattern = new Regex(
        @"<h([1-6])\b[^>]*>.*?</h\1\s*>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    /// <summary>
    /// Each time the page loads, empty the literal control so that output
    /// from a previous request is not shown again.
    /// </summary>
    /// <param name="sender">The page raising the event.</param>
    /// <param name="e">Event data (unused).</param>
    protected void Page_Load(object sender, EventArgs e)
    {
        litContent.Text = "";
    }

    /// <summary>
    /// Event handler for the search button: downloads the page at the URL in
    /// <c>txtUrl</c> and writes every heading element found, HTML-encoded and
    /// one per line, into <c>litContent</c>.
    /// </summary>
    /// <param name="sender">The button raising the event.</param>
    /// <param name="e">Event data (unused).</param>
    protected void btnSearch_Click(object sender, EventArgs e)
    {
        string requestedUrl = txtUrl.Text.Trim();

        // Only absolute http/https URLs are allowed. WebClient would otherwise
        // accept file:// URIs and local paths, letting a visitor read files
        // from the web server.
        // NOTE(review): this does not block requests to internal hosts
        // (e.g. http://localhost/); add an allow-list if the page is public.
        Uri target;
        bool isWebUrl = Uri.TryCreate(requestedUrl, UriKind.Absolute, out target)
            && (target.Scheme == Uri.UriSchemeHttp || target.Scheme == Uri.UriSchemeHttps);
        if (!isWebUrl)
        {
            litContent.Text = "Please enter an absolute http:// or https:// URL.";
            return;
        }

        // Need to trap errors in case of an unresponsive or failing URL
        try
        {
            string content;

            // WebClient is IDisposable; release it as soon as the download ends.
            using (WebClient client = new WebClient())
            {
                content = client.DownloadString(target);
            }

            // Fully qualified because the file does not import System.Text.
            System.Text.StringBuilder output = new System.Text.StringBuilder();

            foreach (Match m in HeadingPattern.Matches(content))
            {
                // HTML encode the tag so it is displayed as text, not rendered
                output.Append(HttpUtility.HtmlEncode(m.Value));
                output.Append("<br/>");
            }

            litContent.Text = output.ToString();
        }
        catch (WebException)
        {
            // The URL came from the user: encode it before echoing it back,
            // otherwise markup typed into the textbox would be rendered (XSS).
            litContent.Text = "Could not connect to " + HttpUtility.HtmlEncode(requestedUrl);
        }
    }
}