// Source code of doc2txt.cs
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace RobvanderWoude
{
    /// <summary>
    /// Command line tool that prints the plain text found in a Word .DOC file
    /// without requiring Word: the file is read as text and every sufficiently
    /// long run of "ordinary" characters is written to standard output.
    /// </summary>
    internal class Doc2Txt
    {
        static readonly string progver = "1.00";

        // Characters stripped from both ends of the file content and of every text run.
        static readonly char[] trimchars = { '\n', '\t', ' ' };

        // Runs of 20 or more characters that are none of the four excluded ones; the
        // escapes are octal: \000 = NUL, \015 = CR, \367 = U+00F7, \377 = U+00FF.
        static readonly Regex textrun = new Regex( "[^\\000\\015\\367\\377]{20,}" );


        /// <summary>
        /// Program entry point.
        /// </summary>
        /// <param name="args">Exactly one argument: the path of the .doc file to read.</param>
        /// <returns>0 on success, 1 if help or an error message was shown.</returns>
        static int Main( string[] args )
        {
            if ( args.Length != 1 || args[0] == "/?" )
            {
                return ShowHelp( );
            }

            string docfile = args[0];

            if ( !File.Exists( docfile ) )
            {
                return ShowHelp( "File not found: \"{0}\"", docfile );
            }

            // Ordinal, case-insensitive comparison instead of a culture-dependent ToLower( )
            if ( !string.Equals( Path.GetExtension( docfile ), ".doc", StringComparison.OrdinalIgnoreCase ) )
            {
                return ShowHelp( "This program can extract text from .DOC files only" );
            }

            string doccontent = string.Empty;
            try
            {
                // "using" guarantees the reader is closed even if ReadToEnd( ) throws
                using ( StreamReader sr = new StreamReader( docfile, false ) )
                {
                    doccontent = sr.ReadToEnd( ).Trim( trimchars );
                }
            }
            catch ( IOException )
            {
                return ShowHelp( "Access to file \"{0}\" denied", docfile );
            }
            catch ( UnauthorizedAccessException )
            {
                // Missing permissions are reported with this exception, which is not an IOException
                return ShowHelp( "Access to file \"{0}\" denied", docfile );
            }

            if ( doccontent.Length == 0 )
            {
                return ShowHelp( "An error occurred while trying to extract text from \"{0}\"", docfile );
            }

            // Discard everything from the first "[Content_Types]" marker onwards
            // (NOTE(review): presumably the start of embedded package data - confirm).
            // A single ordinal search avoids a mismatch between an ordinal Contains( )
            // and a culture-sensitive IndexOf( ).
            int markerpos = doccontent.IndexOf( "[Content_Types]", StringComparison.Ordinal );
            if ( markerpos >= 0 )
            {
                doccontent = doccontent.Substring( 0, markerpos );
            }

            MatchCollection matches = textrun.Matches( doccontent );
            if ( matches.Count == 0 )
            {
                return ShowHelp( "An error occurred while trying to extract text from \"{0}\"", docfile );
            }

            StringBuilder plaintext = new StringBuilder( );
            foreach ( Match match in matches )
            {
                string matchingtext = match.Value.Trim( trimchars );
                // Keep a run only if every character encodes to a single UTF-8 byte
                // (i.e. the run is pure ASCII) and it does not contain character 0x04
                bool singlebytesonly = ( Encoding.UTF8.GetByteCount( matchingtext ) == matchingtext.Length );
                if ( singlebytesonly && matchingtext.IndexOf( (char)4 ) < 0 )
                {
                    plaintext.Append( matchingtext ).Append( "\n" );
                }
            }

            Console.WriteLine( plaintext.ToString( ) );

            return 0;
        }


        /// <summary>
        /// Writes an optional error message followed by the help text to standard error.
        /// </summary>
        /// <param name="errmsg">
        /// Optional: a composite format string, followed by the values for its placeholders.
        /// </param>
        /// <returns>Always 1, so callers can return the result as the program's exit code.</returns>
        static int ShowHelp( params string[] errmsg )
        {
            #region Error Message

            if ( errmsg.Length > 0 )
            {
                // The first element is the format string, the remaining ones are its arguments
                object[] errargs = errmsg.Skip( 1 ).Cast<object>( ).ToArray( );
                Console.Error.WriteLine( );
                Console.ForegroundColor = ConsoleColor.Red;
                Console.Error.Write( "ERROR:\t" );
                Console.ForegroundColor = ConsoleColor.White;
                Console.Error.WriteLine( errmsg[0], errargs );
                Console.ResetColor( );
            }

            #endregion Error Message


            #region Help Text

            /*
            Doc2Txt.exe, Version 1.00
            Return the plain text content of a Word .DOC file without requiring Word

            Usage: Doc2Txt.exe docfile

            Where: docfile is the path of the Word file to be read
                   (no wildcards, only .doc extension allowed)

            Note: Return code ("errorlevel") 1 in case of errors, 0 on success.

            Written by Rob van der Woude
            https://www.robvanderwoude.com
            */

            #endregion Help Text


            #region Display Help Text

            Console.Error.WriteLine( );
            Console.Error.WriteLine( "Doc2Txt.exe, Version {0}", progver );
            Console.Error.WriteLine( "Return the plain text content of a Word .DOC file without requiring Word" );
            Console.Error.WriteLine( );
            Console.Error.Write( "Usage: " );
            Console.ForegroundColor = ConsoleColor.White;
            Console.Error.WriteLine( "Doc2Txt.exe docfile" );
            Console.ResetColor( );
            Console.Error.WriteLine( );
            Console.Error.Write( "Where: " );
            Console.ForegroundColor = ConsoleColor.White;
            Console.Error.Write( "docfile" );
            Console.ResetColor( );
            Console.Error.WriteLine( " is the path of the Word file to be read" );
            Console.Error.WriteLine( " (no wildcards, only .doc extension allowed)" );
            Console.Error.WriteLine( );
            Console.Error.WriteLine( "Note: Return code (\"errorlevel\") 1 in case of errors, 0 on success." );
            Console.Error.WriteLine( );
            Console.Error.WriteLine( "Written by Rob van der Woude" );
            Console.Error.WriteLine( "https://www.robvanderwoude.com" );

            #endregion Display Help Text

            return 1;
        }
    }
}