Rob van der Woude's Scripting Pages
Powered by GeSHi

Source code for sitemap.cs

(view source code of sitemap.cs as plain text)

  1. using System;
  2. using System.Collections.Generic;
  3. using System.Diagnostics;
  4. using System.IO;
  5. using System.Linq;
  6. using System.Reflection;
  7. using System.Text.RegularExpressions;
  8.  
  9.  
  10. namespace RobvanderWoude
  11. {
  12. 	class SiteMap
  13. 	{
  14. 		static string progver = "1.03";
  15. 		static string phpexe = "php.exe";
  16.  
  17.  
		/// <summary>
		/// Program entry point. Parses the command line, lists the files in the
		/// working directory that match the file filter, drops the ones excluded by
		/// "sitemap.exclude" and (unless /I is used) "robots.txt", determines a
		/// last-modified date for every remaining file and writes "sitemap.xml"
		/// to the working directory.
		/// </summary>
		/// <param name="args">
		/// Domain prefix (mandatory, must start with http:// or https://), an optional
		/// working directory, an optional file filter starting with '*' (several
		/// patterns may be separated by spaces) and the switches /I, /P[:filter],
		/// /Q and /W[:file], in any order.
		/// </param>
		/// <returns>0 on success; otherwise the value returned by ShowHelp.</returns>
		static int Main( string[] args )
		{
			bool usefilefilter = false;
			bool usephp = false;
			bool userobots = true;
			bool userooturl = false;
			bool usewhatsnew = false;
			bool useworkingdir = false;
			bool verbose = true;
			string filefilter = "*.html *.php";
			string phpfilter = "*.php";
			string startdir = Directory.GetCurrentDirectory( ); // Program will return to this directory when done
			string workingdir = startdir;                       // Default working directory is the current directory
			string rooturl = String.Empty;
			string whatsnew = "whatsnew.*";

			if ( args.Length == 0 || args.Length > 7 )
			{
				return ShowHelp( );
			}

			foreach ( string arg in args )
			{
				if ( arg == "/?" || arg.Length < 2 )
				{
					return ShowHelp( );
				}

				// Ordinal comparisons: culture-sensitive upper-casing breaks "/i" in e.g. Turkish locales
				if ( arg.StartsWith( "/I", StringComparison.OrdinalIgnoreCase ) )
				{
					if ( !userobots )
					{
						return ShowHelp( "Duplicate command line switch /I" );
					}
					userobots = false;
				}
				else if ( arg.StartsWith( "/P", StringComparison.OrdinalIgnoreCase ) )
				{
					if ( usephp )
					{
						return ShowHelp( "Duplicate command line switch /P" );
					}
					usephp = true;
					phpfilter = ( arg.Length > 3 && arg[2] == ':' ) ? arg.Substring( 3 ) : "*.php";
				}
				else if ( arg.Equals( "/Q", StringComparison.OrdinalIgnoreCase ) )
				{
					if ( !verbose )
					{
						return ShowHelp( "Duplicate command line switch /Q" );
					}
					verbose = false;
				}
				else if ( arg.StartsWith( "/W", StringComparison.OrdinalIgnoreCase ) )
				{
					if ( usewhatsnew )
					{
						return ShowHelp( "Duplicate command line switch /W" );
					}
					usewhatsnew = true;
					if ( arg.Length > 3 && arg[2] == ':' )
					{
						whatsnew = arg.Substring( 3 );
					}
					// The file's existence is verified after the loop, once the
					// working directory is known regardless of argument order
				}
				else if ( arg[0] == '*' )
				{
					if ( usefilefilter )
					{
						return ShowHelp( "Duplicate file filters: \"{0}\" and \"{1}\"", filefilter, arg );
					}
					filefilter = arg;
					usefilefilter = true;
				}
				else if ( Directory.Exists( arg ) )
				{
					if ( useworkingdir )
					{
						return ShowHelp( "Duplicate working directories: \"{0}\" and \"{1}\"", workingdir, arg );
					}
					workingdir = arg;
					useworkingdir = true;
				}
				else if ( arg.StartsWith( "http://", StringComparison.OrdinalIgnoreCase ) || arg.StartsWith( "https://", StringComparison.OrdinalIgnoreCase ) )
				{
					if ( userooturl )
					{
						return ShowHelp( "Duplicate domain prefixes: \"{0}\" and \"{1}\"", rooturl, arg );
					}
					rooturl = arg;
					userooturl = true;
				}
				else if ( arg.Contains( ":\\" ) )
				{
					return ShowHelp( "Invalid working directory: \"{0}\"", arg );
				}
				else
				{
					return ShowHelp( "Invalid command line argument: \"{0}\"", arg );
				}
			}

			// Domain prefix is a mandatory command line argument
			if ( String.IsNullOrEmpty( rooturl ) )
			{
				return ShowHelp( "Please specify a domain prefix" );
			}

			// Make the working directory absolute: a relative path would be resolved
			// a second time (against itself) once it becomes the current directory
			workingdir = Path.GetFullPath( workingdir );

			if ( usewhatsnew )
			{
				if ( FindFilesOrNone( workingdir, whatsnew ).Length == 0 )
				{
					return ShowHelp( "WhatsNew file not found: \"{0}\"", whatsnew );
				}
				whatsnew = Path.GetFileNameWithoutExtension( whatsnew );
			}

			// Go to the specified working directory (required for PHP includes)
			Directory.SetCurrentDirectory( workingdir );
			try
			{
				string excludefile = Path.Combine( workingdir, "sitemap.exclude" );
				string robotsfile = Path.Combine( workingdir, "robots.txt" );
				string sitemapfile = Path.Combine( workingdir, "sitemap.xml" );

				// Find the location of PHP.EXE in case /P switch is used
				if ( usephp )
				{
					string found = LocatePhp( workingdir );
					if ( found == null )
					{
						return ShowHelp( "PHP.EXE not found in %PATH%" );
					}
					phpexe = found;
				}

				// List all files matching filespec
				List<string> candidates = MatchFileNames( workingdir, filefilter );
				if ( candidates.Count == 0 )
				{
					return ShowHelp( "No matching files found for \"{0}\"", filefilter );
				}

				// Names of the files whose content must be rendered by PHP
				HashSet<string> phpnames = new HashSet<string>( );
				if ( usephp )
				{
					phpnames.UnionWith( MatchFileNames( workingdir, phpfilter ) );
				}

				// List all files to be excluded
				HashSet<string> excluded = new HashSet<string>( );
				if ( File.Exists( excludefile ) )
				{
					foreach ( string line in File.ReadLines( excludefile ) )
					{
						AddExclusions( excluded, workingdir, line );
					}
				}
				if ( userobots && File.Exists( robotsfile ) )
				{
					Regex disallow = new Regex( @"^\s*Disallow\s*:\s*/([^\n\r]+[^\n\r/])$", RegexOptions.IgnoreCase );
					foreach ( string line in File.ReadLines( robotsfile ) )
					{
						Match match = disallow.Match( line );
						if ( match.Success )
						{
							AddExclusions( excluded, workingdir, match.Groups[1].Value );
						}
					}
				}

				// Determine lastmod for each file that is not excluded, keeping listing order
				List<KeyValuePair<string, string>> entries = new List<KeyValuePair<string, string>>( );
				string newest = null;
				foreach ( string name in candidates )
				{
					if ( excluded.Contains( name ) )
					{
						continue;
					}
					string fullpath = Path.Combine( workingdir, name );
					string lastmod;
					if ( usephp && phpnames.Contains( name ) )
					{
						// Use PHP to generate content, then extract lastmod from generated content
						lastmod = PHPRender( fullpath );
					}
					else
					{
						// Invariant culture keeps the Gregorian calendar whatever the user's locale
						lastmod = File.GetLastWriteTime( fullpath ).ToString( "yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture );
					}
					entries.Add( new KeyValuePair<string, string>( name, lastmod ) );
					if ( newest == null || String.CompareOrdinal( lastmod, newest ) > 0 )
					{
						newest = lastmod;
					}
				}

				// Write list to XML
				System.Text.StringBuilder xml = new System.Text.StringBuilder( );
				xml.Append( "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n" );
				foreach ( KeyValuePair<string, string> entry in entries )
				{
					string basename = Path.GetFileNameWithoutExtension( entry.Key );
					string lastmod = entry.Value;
					if ( usewhatsnew && String.Equals( basename, whatsnew, StringComparison.OrdinalIgnoreCase ) )
					{
						// whatsnew.* gets the timestamp of the last modified file
						lastmod = newest;
					}
					if ( verbose )
					{
						Console.WriteLine( "{0}\t{1}", lastmod, entry.Key );
					}
					// index.* is published as the bare domain prefix
					string filename = ( basename == "index" ) ? String.Empty : entry.Key;
					// The sitemap protocol wants URLs both URL-escaped and entity-escaped
					string loc = System.Security.SecurityElement.Escape( rooturl + Uri.EscapeDataString( filename ) );
					xml.AppendFormat( "  <url>\n    <loc>{0}</loc>\n    <lastmod>{1}</lastmod>\n  </url>\n", loc, lastmod );
				}
				xml.Append( "</urlset>" );

				// Write XML to sitemap file
				File.WriteAllText( sitemapfile, xml.ToString( ) );

				if ( verbose )
				{
					Console.WriteLine( "\nHandled {0} files", entries.Count );
				}
				return 0;
			}
			finally
			{
				// Go back to the original starting directory, also on early exits
				Directory.SetCurrentDirectory( startdir );
			}
		}


		// Returns the full path of php.exe, looked up in the working directory first
		// and then in each PATH folder in order; null if it cannot be found.
		static string LocatePhp( string workingdir )
		{
			string local = Path.Combine( workingdir, "php.exe" );
			if ( File.Exists( local ) )
			{
				return local;
			}
			string searchpath = Environment.GetEnvironmentVariable( "PATH" ) ?? String.Empty;
			foreach ( string entry in searchpath.Split( new char[] { Path.PathSeparator }, StringSplitOptions.RemoveEmptyEntries ) )
			{
				// PATH entries may be quoted or padded with blanks
				string folder = entry.Trim( ).Trim( '"' );
				if ( folder.Length == 0 )
				{
					continue;
				}
				try
				{
					string candidate = Path.Combine( folder, "php.exe" );
					if ( File.Exists( candidate ) )
					{
						return candidate;
					}
				}
				catch ( ArgumentException )
				{
					// Malformed PATH entry: it cannot contain php.exe, try the next one
					continue;
				}
			}
			return null;
		}


		// Returns the names (without path) of the files in "directory" matching any of
		// the space separated patterns in "filters", in listing order, without duplicates.
		static List<string> MatchFileNames( string directory, string filters )
		{
			List<string> names = new List<string>( );
			HashSet<string> seen = new HashSet<string>( );
			// Directory.GetFiles accepts a single pattern only, so handle them one by one
			foreach ( string pattern in filters.Split( new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries ) )
			{
				foreach ( string file in Directory.GetFiles( directory, pattern ) )
				{
					string name = Path.GetFileName( file );
					if ( seen.Add( name ) )
					{
						names.Add( name );
					}
				}
			}
			return names;
		}


		// Directory.GetFiles for patterns read from optional sources (command line
		// switch, "sitemap.exclude", "robots.txt"): a blank or unusable pattern
		// yields an empty result instead of an exception.
		static string[] FindFilesOrNone( string directory, string pattern )
		{
			if ( String.IsNullOrWhiteSpace( pattern ) )
			{
				return new string[0];
			}
			try
			{
				return Directory.GetFiles( directory, pattern );
			}
			catch ( ArgumentException )
			{
				return new string[0];
			}
			catch ( IOException )
			{
				return new string[0];
			}
			catch ( UnauthorizedAccessException )
			{
				return new string[0];
			}
		}


		// Adds the names of the files in "workingdir" matching "pattern" to "excluded".
		static void AddExclusions( HashSet<string> excluded, string workingdir, string pattern )
		{
			string spec = pattern.Trim( );
			// Only files located in the working directory itself are ever listed, so a
			// pattern pointing into a subfolder must not exclude a same-named file here
			if ( spec.IndexOfAny( new char[] { '/', '\\' } ) >= 0 )
			{
				return;
			}
			foreach ( string file in FindFilesOrNone( workingdir, spec ) )
			{
				excluded.Add( Path.GetFileName( file ) );
			}
		}
  302.  
  303.  
		// Candidate dates in yyyy-mm-dd notation; built once instead of once per rendered file
		static readonly Regex lastmodcandidate = new Regex( @"[12]\d\d\d-[01]\d-[0-3]\d" );


		/// <summary>
		/// Runs the PHP executable stored in "phpexe" on the specified file and
		/// searches the generated output for the latest date in yyyy-mm-dd format.
		/// </summary>
		/// <param name="file">Path of the PHP source file to render.</param>
		/// <returns>
		/// The latest valid yyyy-mm-dd date found in the rendered output if it is later
		/// than the file's last write date, otherwise the file's last write date,
		/// formatted as yyyy-mm-dd.
		/// </returns>
		static string PHPRender( string file )
		{
			// Invariant culture keeps the Gregorian calendar whatever the user's locale
			System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
			string lastmod = File.GetLastWriteTime( file ).ToString( "yyyy-MM-dd", invariant );
			string phptext = String.Empty;

			// Use PHP to render content
			ProcessStartInfo phpproc = new ProcessStartInfo( );
			phpproc.UseShellExecute = false;
			phpproc.CreateNoWindow = true;
			phpproc.RedirectStandardOutput = true;
			phpproc.FileName = phpexe;
			phpproc.Arguments = "-f \"" + file + "\"";
			using ( Process process = Process.Start( phpproc ) )
			{
				if ( process == null )
				{
					// Nothing was started, fall back to the file's own timestamp
					return lastmod;
				}
				// Read the output before waiting, so a full pipe cannot block PHP
				phptext = process.StandardOutput.ReadToEnd( );
				process.WaitForExit( );
			}

			// Extract last modified date from rendered content
			foreach ( Match match in lastmodcandidate.Matches( phptext ) )
			{
				string candidate = match.Value;
				DateTime parsed;
				// The pattern also matches impossible dates like 2017-13-39, reject those
				if ( !DateTime.TryParseExact( candidate, "yyyy-MM-dd", invariant, System.Globalization.DateTimeStyles.None, out parsed ) )
				{
					continue;
				}
				// Fixed-width yyyy-mm-dd strings sort chronologically when compared ordinally
				if ( String.CompareOrdinal( candidate, lastmod ) > 0 )
				{
					lastmod = candidate;
				}
			}
			return lastmod;
		}
  339.  
  340.  
		/// <summary>
		/// Writes an optional error message followed by the help text to the
		/// standard error stream.
		/// </summary>
		/// <param name="errmsg">
		/// Optional: a composite format string, followed by the values for its
		/// {0}, {1}, ... placeholders.
		/// </param>
		/// <returns>Always 1, to be used as the program's return code.</returns>
		static int ShowHelp( params string[] errmsg )
		{

			/*
			SiteMap,  Version <progver>
			Create a Google sitemap for your website source directory

			Usage:    SITEMAP.EXE   domain  [ workingdir ]  [ filespec ]  [ options ]

			Where:    "domain"      the domain prefix to be added, including protocol and
			                        trailing forward slash, e.g. "http://www.example.com/"
			          "workingdir"  the source files' location (default: current directory)
			          "filespec"    the source file filter (default: "*.html *.php")
			Options:  /I            Ignore "robots.txt" (see Notes below)
			          /P[:filter]   use PHP to generate file content for files matching
			                        "filter", then search the generated content for the
			                        latest date in yyyy-mm-dd format
			          /Q            Quiet mode: do not display matching file names
			          /W[:file]     specify a "What's new" file which will be listed
			                        with the timestamp of the last modified file
			                        (default file name: "whatsnew.*")

			Notes:    To use the /P switch, PHP.EXE must be found in the PATH.
			          If no "filter" is specified with the /P switch, the *.php part of
			          "filespec" will be used (if "filespec" isn't specified either,
			          its default value "*.php" is used).
			          The program looks for a list of excluded files in an optional
			          file named "sitemap.exclude", and for "disallowed" files in
			          "robots.txt", both located in the working directory. Use /I
			          to completely ignore "robots.txt".

			Written by Rob van der Woude
			http://www.robvanderwoude.com
			*/

			if ( errmsg.Length > 0 )
			{
				// Everything after the first string is a value for the format string
				object[] errargs = errmsg.Skip( 1 ).Cast<object>( ).ToArray( );
				Console.Error.WriteLine( );
				Console.ForegroundColor = ConsoleColor.Red;
				Console.Error.Write( "ERROR:\t" );
				Console.ForegroundColor = ConsoleColor.White;
				Console.Error.WriteLine( errmsg[0], errargs );
				Console.ResetColor( );
			}

			Console.Error.WriteLine( );
			Console.Error.WriteLine( "SiteMap,  Version {0}", progver );
			Console.Error.WriteLine( "Create a Google sitemap for your website source directory" );
			Console.Error.WriteLine( );

			Console.Error.Write( "Usage:    " );
			WriteHighlighted( "SITEMAP.EXE   domain  [ workingdir ]  [ filespec ]  [ options ]" );
			Console.Error.WriteLine( );
			Console.Error.WriteLine( );

			Console.Error.Write( "Where:    " );
			WriteHighlighted( "\"domain\"" );
			Console.Error.WriteLine( "      the domain prefix to be added, including protocol and" );
			Console.Error.WriteLine( "                        trailing forward slash, e.g. \"http://www.example.com/\"" );

			WriteHighlighted( "          \"workingdir\"" );
			Console.Error.WriteLine( "  the source files' location (default: current directory)" );

			WriteHighlighted( "          \"filespec\"" );
			Console.Error.WriteLine( "    the source file filter (default: \"*.html *.php\")" );

			Console.Error.Write( "Options:  " );
			WriteHighlighted( "/I            I" );
			Console.Error.WriteLine( "gnore \"robots.txt\" (see Notes below)" );

			// Highlighted like every other switch (the highlight was missing for /P only)
			WriteHighlighted( "          /P[:filter]" );
			Console.Error.Write( "   use " );
			WriteHighlighted( "P" );
			Console.Error.WriteLine( "HP to generate file content for files matching " );

			WriteHighlighted( "                        \"filter\"" );
			Console.Error.WriteLine( ", then search the generated content for the" );
			Console.Error.WriteLine( "                        latest date in yyyy-mm-dd format" );

			WriteHighlighted( "          /Q            Q" );
			Console.Error.WriteLine( "uiet mode: do not display matching file names" );

			WriteHighlighted( "          /W[:file]" );
			Console.Error.Write( "     specify a \"" );
			WriteHighlighted( "W" );
			Console.Error.Write( "hat's new\" " );
			WriteHighlighted( "file" );
			Console.Error.WriteLine( " which will be listed" );
			Console.Error.WriteLine( "                        with the timestamp of the last modified file" );

			Console.Error.Write( "                        (default " );
			WriteHighlighted( "file" );
			Console.Error.WriteLine( " name: \"whatsnew.*\")" );

			Console.Error.WriteLine( );

			Console.Error.Write( "Notes:    To use the " );
			WriteHighlighted( "/P" );
			Console.Error.WriteLine( " switch, PHP.EXE must be found in the PATH." );

			Console.Error.Write( "          If no " );
			WriteHighlighted( "\"filter\"" );
			Console.Error.Write( " is specified with the " );
			WriteHighlighted( "/P" );
			Console.Error.WriteLine( " switch, the *.php part of " );

			WriteHighlighted( "          \"filespec\"" );
			Console.Error.Write( " will be used (if " );
			WriteHighlighted( "\"filespec\"" );
			Console.Error.WriteLine( " isn't specified either," );

			Console.Error.WriteLine( "          its default value \"*.php\" is used)." );
			Console.Error.WriteLine( "          The program looks for a list of excluded files in an optional" );
			Console.Error.WriteLine( "          file named \"sitemap.exclude\", and for \"disallowed\" files in" );
			Console.Error.WriteLine( "          \"robots.txt\", both located in the working directory. Use /I" );
			Console.Error.WriteLine( "          to completely ignore \"robots.txt\"." );

			Console.Error.WriteLine( );

			Console.Error.WriteLine( "Written by Rob van der Woude" );
			Console.Error.WriteLine( "http://www.robvanderwoude.com" );

			return 1;
		}


		// Writes text to the standard error stream in white, then restores the console colors.
		static void WriteHighlighted( string text )
		{
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( text );
			Console.ResetColor( );
		}
  513. 	}
  514. }
  515.  

page last uploaded: 2017-08-21, 14:26