~grubng-dev/grubng/tools-urlsdb

« back to all changes in this revision

Viewing changes to ParseURLs.cs

  • Committer: thindil
  • Date: 2009-10-07 09:37:05 UTC
  • Revision ID: thindil2@gmail.com-20091007093705-kp67qhweaxd1prmw
added process rename

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
//  
2
 
//  Copyright (C) 2009,2010,2011 Bartek thindil Jasicki
3
 
// 
4
 
//  This file is part of Grubng
5
 
// 
6
 
//  Grubng is free software: you can redistribute it and/or modify
7
 
//  it under the terms of the GNU General Public License as published by
8
 
//  the Free Software Foundation, either version 3 of the License, or
9
 
//  (at your option) any later version.
10
 
// 
11
 
//  This program is distributed in the hope that it will be useful,
12
 
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 
//  GNU General Public License for more details.
15
 
// 
16
 
//  You should have received a copy of the GNU General Public License
17
 
//  along with this program.  If not, see <http://www.gnu.org/licenses/>.
18
 
// 
19
 
 
20
 
using System;
21
 
using System.Text;
22
 
using System.Text.RegularExpressions;
23
 
 
24
 
namespace urlsdb
25
 
{
26
 
        
27
 
        /// <summary>
28
 
        /// Class provide functions for manipulate URL's
29
 
        /// </summary>
30
 
        internal sealed class ParseURLs : IDisposable
31
 
        {
32
 
                /// <summary>
33
 
                /// Used to generate SHA256 hash of URL
34
 
                /// </summary>
35
 
                System.Security.Cryptography.SHA1Managed SHhash;
36
 
                /// <summary>
37
 
                /// SHA256 hash of URL
38
 
                /// </summary>
39
 
                StringBuilder key;
40
 
                /// <summary>
41
 
                /// Used to generate SHA256 hash of URL
42
 
                /// </summary>
43
 
                byte[] HashValue;
44
 
                /// <summary>
45
 
                /// Used to generate SHA256 hash of URL
46
 
                /// </summary>
47
 
                byte[] PureHash;
48
 
                /// <summary>
49
 
                /// If true, all disposable fields are disposed.
50
 
                /// </summary>
51
 
                bool disposed;
52
 
                
53
 
                /// <summary>
54
 
                /// Standard class constructor.
55
 
                /// </summary>
56
 
                public ParseURLs()
57
 
                {
58
 
                        this.SHhash = new System.Security.Cryptography.SHA1Managed();
59
 
                        this.key = new StringBuilder();
60
 
                }
61
 
                
62
 
                /// <summary>
63
 
                /// Function manipulate URL - check it correctness, drop to lower case host.
64
 
                /// </summary>
65
 
                /// <param name="url">
66
 
                /// A <see cref="System.String"/> URL to parse.
67
 
                /// </param>
68
 
                /// <returns>
69
 
                /// A <see cref="System.String"/> parsed URL or String.Empty if URL is invalid.
70
 
                /// </returns>
71
 
                public static string ParseURL(string url)
72
 
                {
73
 
                        //Remove http(s):// from URL
74
 
                        int index = url.IndexOf("//");
75
 
                        if (index > 0)
76
 
                        {
77
 
                                url = url.Remove(0, (index + 2));
78
 
                        }
79
 
                        //Check correctness of URL
80
 
                        if (!Regex.IsMatch(url, @"^[a-z0-9]+([a-z0-9\=\-\.\?\,\'\/\\\+&amp;%\$#_~]*)?$", RegexOptions.IgnoreCase))
81
 
                        {
82
 
                                return String.Empty;
83
 
                        }
84
 
                        //Conver to lower case host name
85
 
                        index = url.IndexOf('/');
86
 
                        string host, path;
87
 
                        if (index > 0)
88
 
                        {
89
 
                                host = url.Substring(0, index);
90
 
                                path = url.Substring(index);
91
 
                        }
92
 
                        else
93
 
                        {
94
 
                                host = url;
95
 
                                path = "/";
96
 
                        }
97
 
                        host = host.ToLower();
98
 
                        //Validate hostname
99
 
                        if (!Regex.IsMatch(host, @"^[a-z0-9]+([a-z0-9\-\.]*)?$", RegexOptions.IgnoreCase))
100
 
                        {
101
 
                                return String.Empty;
102
 
                        }
103
 
                        //Remove sessions id's from path
104
 
                        if (path.Length > 1)
105
 
                        {
106
 
                                path = Regex.Replace(path, @"(PHPSESSID|s|ses)=[a-z0-9]{32}(&amp;|&)*", String.Empty, RegexOptions.IgnoreCase);
107
 
                                path = path.TrimEnd(new char[] {'&', '?'});
108
 
                                if ((path.LastIndexOf("&amp;") > -1) && (path.LastIndexOf("&amp;") == (path.Length - 5)))
109
 
                                {
110
 
                                        path = path.Remove(path.LastIndexOf("&amp;"));
111
 
                                }
112
 
                        }
113
 
                        url = host + path;
114
 
                        url = System.Web.HttpUtility.UrlEncode(host + path);
115
 
                        url = url.Replace("%2f", "/");
116
 
                        url = url.Replace("%2c", ",");
117
 
                        url = url.Replace("%3d", "=");
118
 
                        url = url.Replace("%3f", "?");
119
 
                        url = url.Replace("%26", "&");
120
 
                        url = url.Replace("%3b", ";");
121
 
                        url = url.Replace("%3a", ":");
122
 
                        url = url.Replace("%40", "@");
123
 
                        url = url.Replace("%23", "#");
124
 
                        url = url.Replace("%7e", "~");
125
 
                        url = url.Replace("%252f", "/");
126
 
                        return url;
127
 
                }
128
 
                
129
 
                /// <summary>
130
 
                /// Function return SHA1 hash of URL
131
 
                /// </summary>
132
 
                /// <param name="url">
133
 
                /// A <see cref="System.String"/> URL to hash
134
 
                /// </param>
135
 
                /// <returns>
136
 
                /// A <see cref="System.String"/> SHA1 hash of URL
137
 
                /// </returns>
138
 
                public string GetHash(string url)
139
 
                {
140
 
                        if (this.disposed)
141
 
                        {
142
 
                                throw new ObjectDisposedException(GetType().Name);
143
 
                        }
144
 
                        if (this.key.Length > 0)
145
 
                        {
146
 
                                this.key.Remove(0, key.Length);
147
 
                        }
148
 
                        this.PureHash = Encoding.UTF8.GetBytes(url);
149
 
                        this.HashValue = SHhash.ComputeHash(this.PureHash);
150
 
                        foreach(byte b in this.HashValue) 
151
 
                        {
152
 
                                this.key.Append(String.Format("{0:x2}", b));
153
 
                        }
154
 
                        return this.key.ToString();
155
 
                }
156
 
                
157
 
                /// <summary>
158
 
                /// Function dispose unmanaged resources.
159
 
                /// </summary>
160
 
                public void Dispose()
161
 
                {
162
 
                        this.Dispose(true);
163
 
                }
164
 
                
165
 
                /// <summary>
166
 
                /// Function dispose unmanaged resources. Private function.
167
 
                /// </summary>
168
 
                /// <param name="disposing">
169
 
                /// A <see cref="System.Boolean"/> if true, dispose all unmanaged resources.
170
 
                /// </param>
171
 
                void Dispose(bool disposing)
172
 
                {
173
 
                        if (!this.disposed)
174
 
                        {
175
 
                                this.SHhash = null;
176
 
                                this.HashValue = null;
177
 
                                this.PureHash = null;
178
 
                                this.disposed = true;
179
 
                                disposing = true;
180
 
                        }
181
 
                }
182
 
        }
183
 
}