~grubng-dev/grubng/tools-urlsdb

14 by thindil
added better URL's parser
1
//  
27 by thindil
added removing PHPSESSID from urls
2
//  Copyright (C) 2009,2010 Bartek thindil Jasicki
14 by thindil
added better URL's parser
3
// 
4
//  This file is part of Grubng
5
// 
6
//  Grubng is free software: you can redistribute it and/or modify
7
//  it under the terms of the GNU General Public License as published by
8
//  the Free Software Foundation, either version 3 of the License, or
9
//  (at your option) any later version.
10
// 
11
//  This program is distributed in the hope that it will be useful,
12
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
13
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
//  GNU General Public License for more details.
15
// 
16
//  You should have received a copy of the GNU General Public License
17
//  along with this program.  If not, see <http://www.gnu.org/licenses/>.
18
// 
19
20
using System;
21
using System.Text;
27 by thindil
added removing PHPSESSID from urls
22
using System.Text.RegularExpressions;
14 by thindil
added better URL's parser
23
24
namespace urlsdb
25
{
26
	
27
	/// <summary>
28
	/// Provides functions for normalizing URLs and computing their SHA1 hashes.
29
	/// </summary>
29 by thindil
some code optimization
30
	internal sealed class ParseURLs : IDisposable
	{
		// Characters accepted in a scheme-less URL (host followed by optional path/query).
		// NOTE(review): the "&amp;" inside the character class also admits ';' -
		// kept as-is because ParseURL later restores "%3b" to ';'.
		private static readonly Regex UrlPattern = new Regex(
			@"^[a-z0-9]+([a-z0-9\=\-\.\?\,\'\/\\\+&amp;%\$#_~]*)?$",
			RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

		// Characters accepted in the host part: alphanumerics, '-' and '.'.
		private static readonly Regex HostPattern = new Regex(
			@"^[a-z0-9]+([a-z0-9\-\.]*)?$",
			RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

		// Matches a PHPSESSID parameter together with any separators in front of it.
		// NOTE(review): the leading '?' is consumed as well, so "/a?PHPSESSID=x&y=1"
		// becomes "/a&y=1". Left unchanged to keep already-normalized URLs stable.
		private static readonly Regex PhpSessionPattern = new Regex(
			@"(&amp;|\?|&)*PHPSESSID=\w+",
			RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

		// Hash algorithm instance and output buffer reused across GetHash calls.
		private readonly System.Security.Cryptography.SHA1Managed SHhash;
		private readonly StringBuilder key;

		/// <summary>
		/// Standard class constructor. Creates the SHA1 algorithm instance and
		/// the string buffer used by <see cref="GetHash"/>.
		/// </summary>
		public ParseURLs()
		{
			this.SHhash = new System.Security.Cryptography.SHA1Managed();
			this.key = new StringBuilder();
		}

		/// <summary>
		/// Normalizes a URL: strips the leading scheme ("http://", "https://"),
		/// validates the allowed characters, lower-cases the host name, removes
		/// any PHPSESSID parameter from the path and URL-encodes the result
		/// while keeping the common URL delimiters readable.
		/// </summary>
		/// <param name="url">
		/// A <see cref="System.String"/> URL to parse.
		/// </param>
		/// <returns>
		/// A <see cref="System.String"/> parsed URL or String.Empty if URL is
		/// null or invalid.
		/// </returns>
		public static string ParseURL(string url)
		{
			if (url == null)
			{
				return String.Empty;
			}
			//Remove http(s):// from URL. The "//" is treated as a scheme
			//separator only when it is also the first '/' in the string;
			//otherwise it belongs to the path ("example.com/a//b") and
			//stripping there would throw away the host.
			int index = url.IndexOf("//", StringComparison.Ordinal);
			if (index > 0 && index == url.IndexOf('/'))
			{
				url = url.Remove(0, (index + 2));
			}
			//Check correctness of URL
			if (!UrlPattern.IsMatch(url))
			{
				return String.Empty;
			}
			//Split into host and path; a URL without '/' gets the root path
			index = url.IndexOf('/');
			string host, path;
			if (index > 0)
			{
				host = url.Substring(0, index);
				path = url.Substring(index);
			}
			else
			{
				host = url;
				path = "/";
			}
			//Convert host name to lower case. Invariant casing keeps the
			//result independent of the current culture (e.g. Turkish 'I').
			host = host.ToLowerInvariant();
			//Validate hostname
			if (!HostPattern.IsMatch(host))
			{
				return String.Empty;
			}
			//Remove PHPSESSID from path
			if (path.Length > 1)
			{
				path = PhpSessionPattern.Replace(path, String.Empty);
			}
			//Encode everything, then restore the delimiters that must stay
			//literal. The lower-case escapes are the form HttpUtility.UrlEncode emits.
			url = System.Web.HttpUtility.UrlEncode(host + path);
			url = url.Replace("%2f", "/");
			url = url.Replace("%2c", ",");
			url = url.Replace("%3d", "=");
			url = url.Replace("%3f", "?");
			url = url.Replace("%26", "&");
			url = url.Replace("%3b", ";");
			url = url.Replace("%3a", ":");
			url = url.Replace("%40", "@");
			url = url.Replace("%23", "#");
			url = url.Replace("%7e", "~");
			//An already escaped slash in the input ("%2f") was double-encoded above
			url = url.Replace("%252f", "/");
			return url;
		}

		/// <summary>
		/// Function return SHA1 hash of URL, computed over its UTF-8 bytes.
		/// </summary>
		/// <param name="url">
		/// A <see cref="System.String"/> URL to hash
		/// </param>
		/// <returns>
		/// A <see cref="System.String"/> SHA1 hash of URL as lower-case
		/// hexadecimal digits.
		/// </returns>
		public string GetHash(string url)
		{
			//Reset the shared buffer left over from the previous call
			this.key.Length = 0;
			byte[] hash = this.SHhash.ComputeHash(Encoding.UTF8.GetBytes(url));
			foreach (byte b in hash)
			{
				this.key.Append(b.ToString("x2"));
			}
			return this.key.ToString();
		}

		/// <summary>
		/// Releases the SHA1 algorithm instance owned by this object.
		/// </summary>
		public void Dispose()
		{
			//Cast so the call also compiles on runtimes where
			//HashAlgorithm implements Dispose explicitly.
			((IDisposable)this.SHhash).Dispose();
		}
	}
132
}