Main Page | Class Hierarchy | Class List | File List | Class Members

ImageUriParser.cs

00001 using System;
00002 using System.Text;
00003 using System.Collections.Specialized;
00004 using System.Text.RegularExpressions;
00005 
00006 namespace Common
00007 {
/// <summary>
/// Extracts the image references (<c>&lt;img src=...&gt;</c>) from an HTML
/// document and turns them into URIs / follow-up HTTP requests.
/// All functionality is exposed through static methods.
/// </summary>
public class ImageUriParser
{
    /// <summary>
    /// Matches a single <c>img</c> tag and captures its <c>src</c> value in group 1.
    /// The value may be double-quoted, single-quoted or unquoted. Every
    /// sub-expression is bounded by <c>[^&gt;]</c> so a match can never run past
    /// the end of the tag it started in, and an unquoted value never swallows
    /// the closing <c>&gt;</c>. Built once because a compiled Regex is expensive
    /// to construct.
    /// </summary>
    private static readonly Regex ImgSrcPattern = new Regex(
        "<\\s*img\\b[^>]*?\\ssrc\\s*=\\s*(?:\"(?<1>[^\"]*)\"|'(?<1>[^']*)'|(?<1>[^\\s>]+))[^>]*>",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>Line-break characters stripped from both ends of a captured src value.</summary>
    private static readonly char[] LineBreakChars = new char[] { '\n', '\r', '\f' };

    /// <summary>
    /// Public parameterless constructor, kept so existing callers that
    /// instantiate the class keep compiling. The class holds no instance state.
    /// </summary>
    public ImageUriParser() {}

    /// <summary>
    /// Builds one request per distinct image referenced by an HTML response.
    /// </summary>
    /// <param name="msgLog">Logger that receives parse statistics and errors.</param>
    /// <param name="originalRequest">Request that produced the HTML; its URI is the base for relative image paths and it is passed to each new request's constructor.</param>
    /// <param name="htmlResponse">Response whose body bytes are scanned for img tags.</param>
    /// <returns>
    /// An array with one slot per distinct image reference. A slot is null when
    /// the reference was not a valid URI or when the request could not be constructed.
    /// </returns>
    public static HTTPRequest[] GenerateImageRequests(MessageLogger msgLog, HTTPRequest originalRequest, HTTPResponse htmlResponse) {
        Uri[] imageUris = GetImageUris(msgLog, originalRequest.URI, htmlResponse.Body.data);
        HTTPRequest[] requests = new HTTPRequest[imageUris.Length];
        for (int i = 0; i < imageUris.Length; i++) {
            if (imageUris[i] == null) {
                continue; // reference was dropped by GetImageUris; leave the slot null
            }
            try {
                requests[i] = new HTTPRequest(originalRequest, imageUris[i]);
            } catch (Exception ex) {
                // Best effort: one bad image must not prevent the others from being requested.
                msgLog.LogError("Unable to create image request for {0} : {1}", imageUris[i], ex.Message);
                requests[i] = null;
            }
        }
        return requests;
    }

    /// <summary>
    /// Scans HTML for img tags and returns the distinct image URIs they reference.
    /// </summary>
    /// <param name="msgLog">Logger that receives the match count and any bad-URI errors.</param>
    /// <param name="baseUri">URI against which relative src values are resolved.</param>
    /// <param name="htmlAsASCIIBytes">The HTML document; decoded with <c>Encoding.ASCII</c>.</param>
    /// <returns>
    /// One entry per distinct, non-empty src value, in order of first appearance.
    /// An entry is null when its src value could not be turned into a URI, so
    /// callers must null-check each element.
    /// </returns>
    public static Uri[] GetImageUris(MessageLogger msgLog, Uri baseUri, byte[] htmlAsASCIIBytes) {
        string htmlString = Encoding.ASCII.GetString(htmlAsASCIIBytes);

        // Collect distinct src values in a single pass, while still counting
        // every match so the log line keeps reporting the pre-dedup total.
        StringCollection uniqueStrings = new StringCollection();
        int matchCount = 0;
        for (Match m = ImgSrcPattern.Match(htmlString); m.Success; m = m.NextMatch()) {
            matchCount++;
            // Strings are immutable: Trim returns the cleaned copy, so it must be assigned.
            string src = m.Groups[1].Value.Trim(LineBreakChars);
            if (src.Length == 0) {
                continue; // an empty src would resolve to the page's own URI, not an image
            }
            if (!uniqueStrings.Contains(src)) {
                uniqueStrings.Add(src);
            }
        }
        msgLog.Log("Parse found {0} images before duplicate removal", matchCount);

        // Create URIs from the unique strings.
        Uri[] result = new Uri[uniqueStrings.Count];
        for (int i = 0; i < uniqueStrings.Count; i++) {
            try {
                // If the src value is already an absolute URI, the base URI is ignored.
                result[i] = new Uri(baseUri, uniqueStrings[i], true);
            } catch (UriFormatException) {
                // Leave result[i] null; callers skip null entries.
                msgLog.LogError("Dropping this parse and push IMG - bad URI");
            }
        }
        return result;
    }
}
00080 }

Generated on Mon May 8 22:07:27 2006 by  doxygen 1.3.9.1