using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;


namespace RobvanderWoude
{
	/// <summary>
	/// Command line tool that prints the plain text found in a Word .DOC file
	/// by scanning the file's raw content for long runs of ASCII text.
	/// </summary>
	internal class Doc2Txt
	{
		static readonly string progver = "1.00";

		// Characters stripped from both ends of the file content and of every text fragment
		static readonly char[] trimchars = "\n\t ".ToCharArray( );

		// Everything from this marker onward is discarded before text fragments are searched
		const string contenttypesmarker = "[Content_Types]";

		// A text fragment is a run of 20 or more characters, none of which is one of the
		// octal-escaped characters \000 (NUL), \015 (CR), \367 (U+00F7) or \377 (U+00FF)
		static readonly Regex fragmentregex = new Regex( "[^\\000\\015\\367\\377]{20,}" );


		/// <summary>
		/// Program entry point: validates the command line, reads the file and
		/// writes the extracted text to standard output.
		/// </summary>
		/// <param name="args">Exactly one argument: the path of the .doc file, or "/?" for help.</param>
		/// <returns>0 on success, 1 if help was shown or an error occurred.</returns>
		static int Main( string[] args )
		{
			if ( args.Length != 1 || args[0] == "/?" )
			{
				return ShowHelp( );
			}

			string docfile = args[0];

			if ( !File.Exists( docfile ) )
			{
				return ShowHelp( "File not found: \"{0}\"", docfile );
			}

			// Ordinal, case-insensitive comparison instead of a culture-sensitive ToLower( )
			if ( !string.Equals( Path.GetExtension( docfile ), ".doc", StringComparison.OrdinalIgnoreCase ) )
			{
				return ShowHelp( "This program can extract text from .DOC files only" );
			}

			string doccontent;
			try
			{
				// "using" guarantees the reader is disposed even if ReadToEnd( ) throws
				using ( StreamReader sr = new StreamReader( docfile, false ) )
				{
					doccontent = sr.ReadToEnd( ).Trim( trimchars );
				}
			}
			catch ( IOException )
			{
				return ShowHelp( "Access to file \"{0}\" denied", docfile );
			}
			catch ( UnauthorizedAccessException )
			{
				// Not derived from IOException, so it needs its own handler
				return ShowHelp( "Access to file \"{0}\" denied", docfile );
			}

			if ( doccontent.Length == 0 )
			{
				return ShowHelp( "An error occurred while trying to extract text from \"{0}\"", docfile );
			}

			string plaintext;
			if ( !TryExtractText( doccontent, out plaintext ) )
			{
				return ShowHelp( "An error occurred while trying to extract text from \"{0}\"", docfile );
			}

			Console.WriteLine( plaintext );

			return 0;
		}


		/// <summary>
		/// Extracts the readable text fragments from the raw content of a .doc file.
		/// </summary>
		/// <param name="doccontent">The file content, decoded to a string.</param>
		/// <param name="plaintext">
		/// Receives the accepted fragments, each followed by "\n". This is an empty
		/// string when fragments were found but none passed the filters.
		/// </param>
		/// <returns>
		/// <c>false</c> if <paramref name="doccontent"/> is null or contains no
		/// fragment at all, otherwise <c>true</c>.
		/// </returns>
		internal static bool TryExtractText( string doccontent, out string plaintext )
		{
			plaintext = string.Empty;

			if ( doccontent == null )
			{
				return false;
			}

			// One ordinal search: a culture-sensitive IndexOf( string ) may disagree with
			// the ordinal Contains( ) on text full of NUL characters and yield -1
			int markerindex = doccontent.IndexOf( contenttypesmarker, StringComparison.Ordinal );
			if ( markerindex >= 0 )
			{
				doccontent = doccontent.Substring( 0, markerindex );
			}

			MatchCollection matches = fragmentregex.Matches( doccontent );
			if ( matches.Count == 0 )
			{
				return false;
			}

			StringBuilder collected = new StringBuilder( );
			foreach ( Match match in matches )
			{
				string matchingtext = match.Value.Trim( trimchars );

				// UTF-8 byte count equals the character count only if every character is ASCII
				bool asciionly = ( Encoding.UTF8.GetByteCount( matchingtext ) == matchingtext.Length );

				// Fragments containing control character U+0004 are skipped
				if ( asciionly && matchingtext.IndexOf( (char)4 ) < 0 )
				{
					collected.Append( matchingtext ).Append( '\n' );
				}
			}

			plaintext = collected.ToString( );
			return true;
		}


		/// <summary>
		/// Writes an optional error message followed by the help text to standard error.
		/// </summary>
		/// <param name="errmsg">
		/// Optional: a composite format string, followed by the values for its placeholders.
		/// </param>
		/// <returns>Always 1, to be used as the program's return code.</returns>
		static int ShowHelp( params string[] errmsg )
		{
			#region Error Message

			if ( errmsg.Length > 0 )
			{
				// All elements after the first are the format arguments; an explicit
				// object[] selects the WriteLine( string, object[] ) overload
				object[] errargs = errmsg.Skip( 1 ).Cast<object>( ).ToArray( );
				Console.Error.WriteLine( );
				Console.ForegroundColor = ConsoleColor.Red;
				Console.Error.Write( "ERROR:\t" );
				Console.ForegroundColor = ConsoleColor.White;
				Console.Error.WriteLine( errmsg[0], errargs );
				Console.ResetColor( );
			}

			#endregion Error Message


			#region Help Text

			/*
			Doc2Txt.exe,  Version 1.00
			Return the plain text content of a Word .DOC file without requiring Word

			Usage:    Doc2Txt.exe  docfile

			Where:    docfile      is the path of the Word file to be read
			                       (no wildcards, only .doc extension allowed)

			Note:     Return code ("errorlevel") 1 in case of errors, 0 on success.

			Written by Rob van der Woude
			https://www.robvanderwoude.com
			*/

			#endregion Help Text


			#region Display Help Text

			Console.Error.WriteLine( );

			Console.Error.WriteLine( "Doc2Txt.exe, Version {0}", progver );

			Console.Error.WriteLine( "Return the plain text content of a Word .DOC file without requiring Word" );

			Console.Error.WriteLine( );

			Console.Error.Write( "Usage: " );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.WriteLine( "Doc2Txt.exe docfile" );
			Console.ResetColor( );

			Console.Error.WriteLine( );

			Console.Error.Write( "Where: " );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( "docfile" );
			Console.ResetColor( );
			Console.Error.WriteLine( " is the path of the Word file to be read" );

			Console.Error.WriteLine( " (no wildcards, only .doc extension allowed)" );

			Console.Error.WriteLine( );

			Console.Error.WriteLine( "Note: Return code (\"errorlevel\") 1 in case of errors, 0 on success." );

			Console.Error.WriteLine( );

			Console.Error.WriteLine( "Written by Rob van der Woude" );

			Console.Error.WriteLine( "https://www.robvanderwoude.com" );

			#endregion Display Help Text


			return 1;
		}
	}
}