// (view source code of word2txt.cs as plain text)
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;
using Word = Microsoft.Office.Interop.Word;
namespace RobvanderWoude
{
internal class Word2Txt
{
// Program version, shown in the help text written by ShowHelp
static string progver = "1.05";
// Text extracted from the document; assigned by the Read*File methods and printed by Main
static string plaintext = string.Empty;
/// <summary>
/// Program entry point: parses the command line, extracts the text from the
/// specified document and writes it to standard output.
/// </summary>
/// <param name="args">Command line arguments; see ShowHelp for the accepted syntax.</param>
/// <returns>
/// 0 if Word extracted the text, 1 if one of the built-in fallback readers had
/// to be used, 2 on command line errors or if no text could be extracted.
/// </returns>
static int Main( string[] args )
{
    int rc = 0;
    string document = string.Empty;
    bool success = false;
    bool usexmlencoding = false;
    bool readfromxml = false; // true if the text came from the XML inside a .docx/.odt archive
    Encoding encoding = null;

    #region Parse Command Line

    if ( args.Length == 0 || args.Length > 2 )
    {
        return ShowHelp( );
    }

    foreach ( string arg in args )
    {
        // An empty argument ("") has no first character to inspect;
        // it falls through and is reported as a file that cannot be found
        if ( arg.Length > 0 && arg[0] == '/' )
        {
            if ( arg == "/?" )
            {
                return ShowHelp( );
            }
            else if ( arg.StartsWith( "/D", StringComparison.OrdinalIgnoreCase ) )
            {
                usexmlencoding = true;
            }
            else if ( string.Equals( arg, "/E", StringComparison.OrdinalIgnoreCase ) )
            {
                return ListEncodings( );
            }
            else
            {
                return ShowHelp( "Invalid command line switch {0}", arg );
            }
        }
        else if ( string.IsNullOrWhiteSpace( document ) )
        {
            document = arg;
            if ( !File.Exists( document ) )
            {
                return ShowHelp( "File \"{0}\" not found", document );
            }
        }
        else if ( encoding == null )
        {
            encoding = GetEncoding( arg );
            if ( encoding == null )
            {
                return ShowHelp( "Invalid encoding \"{0}\"", arg );
            }
        }
        else
        {
            return ShowHelp( "Too many command line arguments" );
        }
    }

    if ( string.IsNullOrWhiteSpace( document ) )
    {
        return ShowHelp( );
    }

    #endregion Parse Command Line

    #region Extract Text

    // First try using Word if possible
    if ( IsWordInstalled( ) )
    {
        // If Word is installed, this program can handle ANY document format that is recognized by Word
        success = ReadWordFile( document );
    }

    // if Word isn't available or could not extract any text, try plan B
    if ( !success || string.IsNullOrWhiteSpace( plaintext ) )
    {
        rc = 1;
        string ext = Path.GetExtension( document ).ToLowerInvariant( );
        if ( ext == ".doc" )
        {
            success = ReadDocFile( document );
        }
        else if ( ext == ".docx" || ext == ".odt" )
        {
            success = ReadDocxOrOdtFile( document );
            readfromxml = true;
        }
        else if ( ext == ".rtf" )
        {
            success = ReadRTFFile( document );
        }
        else if ( ext == ".wpd" )
        {
            success = ReadWPDFile( document );
        }
        else
        {
            return ShowHelp( "If Word is not installed or fails to extract text, this program can only handle .DOC, .DOCX, .ODT, .RTF and .WPD files" );
        }
    }

    #endregion Extract Text

    #region Cleanup Text and Display Result

    if ( success && !string.IsNullOrWhiteSpace( plaintext ) )
    {
        // convert stray carriage returns to carriage return/linefeed pairs
        plaintext = ConvertStrayCarriageReturns( plaintext ).Trim( "\n\r\t ".ToCharArray( ) );

        if ( usexmlencoding )
        {
            // /D: use the encoding declared in the document's XML; this only applies
            // when the text was actually read from that XML (plan B for .docx/.odt).
            // GetEncoding returns null for an empty name, i.e. the default output encoding.
            encoding = GetEncoding( readfromxml ? GetDeclaredXmlEncoding( document ) : string.Empty );
        }

        if ( encoding == null )
        {
            // send text to console using default output encoding
            Console.WriteLine( plaintext );
        }
        else
        {
            // temporarily change output encoding and send text to console;
            // the finally block restores the console even if writing fails
            Encoding oldencoding = Console.OutputEncoding;
            try
            {
                Console.OutputEncoding = encoding;
                Console.WriteLine( plaintext );
            }
            finally
            {
                Console.OutputEncoding = oldencoding;
            }
        }
    }
    else
    {
        rc = 2;
    }

    #endregion Cleanup Text and Display Result

    return rc;
}

/// <summary>
/// Reads the encoding named in the XML declaration of the main content file
/// inside a .docx or .odt archive.
/// </summary>
/// <param name="docfile">Path of the .docx or .odt file.</param>
/// <returns>The declared encoding name, or an empty string if it cannot be determined.</returns>
static string GetDeclaredXmlEncoding( string docfile )
{
    string ext = Path.GetExtension( docfile ).ToLowerInvariant( );
    string contentfile;
    if ( ext == ".odt" )
    {
        contentfile = "content.xml";
    }
    else if ( ext == ".docx" )
    {
        contentfile = "document.xml";
    }
    else
    {
        return string.Empty;
    }

    try
    {
        using ( ZipArchive archive = ZipFile.OpenRead( docfile ) )
        {
            foreach ( ZipArchiveEntry entry in archive.Entries )
            {
                if ( string.Equals( entry.Name, contentfile, StringComparison.OrdinalIgnoreCase ) )
                {
                    using ( StreamReader reader = new StreamReader( entry.Open( ) ) )
                    {
                        // The XML declaration, if present, is at the very start of the file
                        char[] buffer = new char[100];
                        int count = reader.ReadBlock( buffer, 0, buffer.Length );
                        string head = new string( buffer, 0, count );
                        return Regex.Match( head, " encoding=\"([^\"]+)\"" ).Groups[1].Value;
                    }
                }
            }
        }
    }
    catch ( IOException )
    {
        // best effort only: fall through and report "unknown"
    }
    catch ( InvalidDataException )
    {
        // not a valid ZIP archive: fall through and report "unknown"
    }
    return string.Empty;
}
/// <summary>
/// Replaces every carriage return that is not immediately followed by a
/// linefeed with Environment.NewLine.
/// </summary>
/// <param name="text">The text to normalize.</param>
/// <returns>The text with all stray carriage returns replaced.</returns>
static string ConvertStrayCarriageReturns( string text )
{
    // A carriage return (\r) with a negative lookahead for a linefeed (\n),
    // i.e. only the carriage returns that are NOT part of a \r\n pair
    const string straycarriagereturn = "\r(?!\n)";
    return Regex.Replace( text, straycarriagereturn, Environment.NewLine );
}
/// <summary>
/// Finds an available encoding by name or by codepage number.
/// </summary>
/// <remarks>
/// Three passes are made over the available encodings, in this order:
/// exact name (ignoring case), name with all dashes removed (ignoring case),
/// and finally the codepage number written as text.
/// Names are compared with ordinal case-insensitive rules, so the result does
/// not depend on the current culture (e.g. Turkish casing of "I").
/// </remarks>
/// <param name="myencoding">Encoding name (e.g. "UTF-8" or "utf8") or codepage (e.g. "65001").</param>
/// <returns>The matching encoding, or null if the argument is null/empty or nothing matches.</returns>
static Encoding GetEncoding( string myencoding )
{
    if ( string.IsNullOrEmpty( myencoding ) )
    {
        return null;
    }

    // Get a list of available encodings
    EncodingInfo[] encodings = Encoding.GetEncodings( );

    // Try correctly spelled encodings first
    foreach ( EncodingInfo info in encodings )
    {
        if ( string.Equals( info.Name, myencoding, StringComparison.OrdinalIgnoreCase ) )
        {
            return Encoding.GetEncoding( info.CodePage );
        }
    }

    // No direct match found, try again, ignoring dashes
    string nodashes = myencoding.Replace( "-", "" );
    foreach ( EncodingInfo info in encodings )
    {
        if ( string.Equals( info.Name.Replace( "-", "" ), nodashes, StringComparison.OrdinalIgnoreCase ) )
        {
            return Encoding.GetEncoding( info.CodePage );
        }
    }

    // Still no match, try codepages
    foreach ( EncodingInfo info in encodings )
    {
        if ( info.CodePage.ToString( ) == myencoding )
        {
            return Encoding.GetEncoding( info.CodePage );
        }
    }

    // Still no match, giving up
    return null;
}
/// <summary>
/// Checks whether the "Word.Application" ProgID can be resolved to a type.
/// </summary>
/// <returns>true if a type is registered for the ProgID, otherwise false.</returns>
static bool IsWordInstalled( )
{
    // Source: "How to Check Whether Word is Installed in the System or Not" by Tadit Dash
    // https://www.codeproject.com/Tips/689968/How-to-Check-Whether-Word-is-Installed-in-the-Syst
    Type wordtype = Type.GetTypeFromProgID( "Word.Application" );
    bool installed = wordtype != null;
    return installed;
}
/// <summary>
/// Writes a table of all available encodings (name and codepage), sorted by
/// name, to standard output.
/// </summary>
/// <returns>Always 0.</returns>
static int ListEncodings( )
{
    try
    {
        Console.Clear( );
    }
    catch ( IOException )
    {
        // Console.Clear( ) throws an IO exception if the output is redirected
    }

    // Sort the encoding descriptors themselves, so the codepage of each one is
    // at hand and does not have to be looked up again by name
    EncodingInfo[] allencodings = Encoding.GetEncodings( ).OrderBy( info => info.Name ).ToArray( );

    // The name column is at least as wide as its header
    int columnwidth = 8;
    foreach ( EncodingInfo info in allencodings )
    {
        columnwidth = Math.Max( columnwidth, info.Name.Length );
    }

    string format = "{0,-" + columnwidth + "} {1}";
    Console.WriteLine( format, "Encoding", "CodePage" );
    Console.WriteLine( format, "========", "========" );
    foreach ( EncodingInfo info in allencodings )
    {
        Console.WriteLine( format, info.Name, info.CodePage );
    }
    return 0;
}
/// <summary>
/// Fallback reader for .doc files: reads the file as text and collects long
/// runs of plain ASCII characters into the plaintext field.
/// </summary>
/// <param name="docfile">Path of the .doc file.</param>
/// <returns>
/// false if the file cannot be read, is empty, or contains no candidate text
/// runs; otherwise true (plaintext may still be empty if every run was rejected).
/// </returns>
static bool ReadDocFile( string docfile )
{
    char[] trimchars = "\n\t ".ToCharArray( );
    string doccontent;
    try
    {
        // The using block closes the file even if reading fails half-way
        using ( StreamReader sr = new StreamReader( docfile, false ) )
        {
            doccontent = sr.ReadToEnd( ).Trim( trimchars );
        }
    }
    catch ( IOException )
    {
        ShowHelp( "Access to file \"{0}\" denied", docfile );
        return false;
    }
    catch ( UnauthorizedAccessException )
    {
        ShowHelp( "Access to file \"{0}\" denied", docfile );
        return false;
    }

    if ( doccontent.Length == 0 )
    {
        return false;
    }

    // Ignore everything from the "[Content_Types]" marker onwards; the search must be
    // ordinal, a culture-sensitive search is unreliable on content full of control characters
    int marker = doccontent.IndexOf( "[Content_Types]", StringComparison.Ordinal );
    if ( marker >= 0 )
    {
        doccontent = doccontent.Substring( 0, marker );
    }

    // Runs of at least 20 characters, none of which is one of the four characters
    // given as octal escapes (000, 015, 367, 377)
    Regex regex = new Regex( "[^\\000\\015\\367\\377]{20,}" );
    MatchCollection matches = regex.Matches( doccontent );
    if ( matches.Count == 0 )
    {
        return false;
    }

    StringBuilder text = new StringBuilder( );
    foreach ( Match match in matches )
    {
        string matchingtext = match.Value.Trim( trimchars );
        // Keep a run only if it is pure ASCII (UTF-8 byte count equals character count)
        // and does not contain character 4
        if ( Encoding.UTF8.GetByteCount( matchingtext ) == matchingtext.Length && !matchingtext.Contains( (char)4 ) )
        {
            text.Append( matchingtext ).Append( "\n" );
        }
    }
    plaintext = text.ToString( );
    return true;
}
/// <summary>
/// Fallback reader for .docx and .odt files: opens the document as a ZIP
/// archive, reads its main XML content file and strips the XML tags,
/// storing the result in the plaintext field.
/// </summary>
/// <param name="docfile">Path of the .docx or .odt file.</param>
/// <returns>
/// true if the XML content file was found and read; false for other file
/// extensions, unreadable files, invalid archives or archives without the
/// expected content file.
/// </returns>
static bool ReadDocxOrOdtFile( string docfile )
{
    string contentfile; // file name of the XML content file
    string mainpart;    // its expected full path inside the archive
    string ext = Path.GetExtension( docfile ).ToLower( );
    if ( ext == ".odt" ) // OpenOffice document
    {
        contentfile = "content.xml";
        mainpart = "content.xml";
    }
    else if ( ext == ".docx" ) // MS Office document
    {
        contentfile = "document.xml";
        mainpart = "word/document.xml";
    }
    else
    {
        return false;
    }

    string content;
    try
    {
        // Open document as ZIP file and read the XML file containing the text content
        // straight from the archive, no temporary file required
        using ( ZipArchive archive = ZipFile.OpenRead( docfile ) )
        {
            ZipArchiveEntry found = null;
            foreach ( ZipArchiveEntry entry in archive.Entries )
            {
                if ( string.Equals( entry.FullName, mainpart, StringComparison.OrdinalIgnoreCase ) )
                {
                    // The main document part always wins over same-named files in
                    // other folders (e.g. a glossary or an embedded object)
                    found = entry;
                    break;
                }
                if ( string.Equals( entry.Name, contentfile, StringComparison.OrdinalIgnoreCase ) )
                {
                    // Fallback for archives with an unusual layout
                    found = entry;
                }
            }
            if ( found == null )
            {
                return false;
            }
            using ( StreamReader sr = new StreamReader( found.Open( ) ) )
            {
                content = sr.ReadToEnd( ).Trim( "\n\r\t ".ToCharArray( ) );
            }
        }
    }
    catch ( IOException )
    {
        ShowHelp( "Access to file \"{0}\" denied", docfile );
        return false;
    }
    catch ( UnauthorizedAccessException )
    {
        ShowHelp( "Access to file \"{0}\" denied", docfile );
        return false;
    }
    catch ( InvalidDataException )
    {
        // The file is not a valid ZIP archive, so there is nothing to extract
        return false;
    }

    // insert newlines after headers, list items and paragraphs
    Regex regex = new Regex( "</(text|w):(h|p)>" );
    plaintext = regex.Replace( content, "\n\n" );
    regex = new Regex( "<w:br/>" );
    plaintext = regex.Replace( plaintext, "\n\n" );
    // remove all XML tags
    regex = new Regex( "<[^>]+>" );
    plaintext = regex.Replace( plaintext, "" );
    return true;
}
/// <summary>
/// Fallback reader for .rtf files: lets a RichTextBox control convert the
/// RTF markup to plain text and stores the result in the plaintext field.
/// </summary>
/// <param name="rtffile">Path of the .rtf file.</param>
/// <returns>true if the file was read and converted; false if it could not be read or is not valid RTF.</returns>
static bool ReadRTFFile( string rtffile )
{
    // Use a hidden RichTextBox to convert RTF to plain text, by Wendy Zang
    // https://social.msdn.microsoft.com/Forums/vstudio/en-US/6e56af9b-d7d3-49f3-9ec4-80edde3fe54b/reading-modifying-rtf-files?forum=csharpgeneral#a64345e9-cfcb-43be-ab18-c08fae02cb2a
    try
    {
        // The control is never shown, so it must be disposed of explicitly
        using ( RichTextBox rtbox = new RichTextBox( ) )
        {
            rtbox.Rtf = File.ReadAllText( rtffile );
            plaintext = rtbox.Text;
        }
    }
    catch ( IOException )
    {
        return false;
    }
    catch ( ArgumentException )
    {
        // Assigning text that is not valid RTF to RichTextBox.Rtf is rejected with this exception
        return false;
    }
    return true;
}
/// <summary>
/// Opens the document in a new, invisible Word instance and copies its
/// text content to the plaintext field.
/// </summary>
/// <param name="wordfile">Path of the document; may be relative to the current directory.</param>
/// <returns>true if Word opened the document and returned its text; false on any error.</returns>
static bool ReadWordFile( string wordfile )
{
    object savechanges = Word.WdSaveOptions.wdDoNotSaveChanges;
    Word.Application wordapp = null;
    bool success = false;
    try
    {
        // Created inside the try block, so a failure to start Word is reported
        // as "no success" and the caller can fall back to the built-in readers
        wordapp = new Word.Application( );
        wordapp.Visible = false;
        // Word runs as a separate process with its own working directory,
        // so always hand it an absolute path
        Word.Document worddoc = wordapp.Documents.Open( Path.GetFullPath( wordfile ) );
        wordapp.Selection.WholeStory( );
        plaintext = worddoc.Content.Text;
        worddoc.Close( ref savechanges );
        success = true;
    }
    catch ( Exception )
    {
        success = false;
    }
    finally
    {
        if ( wordapp != null )
        {
            try
            {
                wordapp.Quit( ref savechanges );
            }
            catch ( Exception )
            {
                // Word could not be told to quit (e.g. it is already gone);
                // this must not change the result of the extraction
            }
        }
    }
    return success;
}
/// <summary>
/// Fallback reader for .wpd (WordPerfect) files: strips most of the file
/// header and translates the remaining characters one by one, storing the
/// result in the plaintext field.
/// </summary>
/// <param name="wpfile">Path of the .wpd file.</param>
/// <returns>true if any non-whitespace text was extracted; false otherwise or if the file could not be read.</returns>
static bool ReadWPDFile( string wpfile )
{
    string wpcontent;
    try
    {
        wpcontent = File.ReadAllText( wpfile, Encoding.UTF8 );
    }
    catch ( IOException )
    {
        return false;
    }

    // Remove (most of) the WPD file header - WARNING: regex pattern depends on Encoding used for reading the file!
    Regex regex = new Regex( "^[\\w\\W]*\\000{8,}([^\\w]+[B-HJ-NP-TV-Z\\d])*[^\\w-]+", RegexOptions.IgnoreCase );
    wpcontent = regex.Replace( wpcontent, "" );

    // WPD file format info based on http://justsolve.archiveteam.org/wiki/WordPerfect
    // Modified for spaces, linefeeds and e acute by yours truly
    // More modifications are required for accented characters
    // A StringBuilder keeps the character-by-character translation linear in the file size
    StringBuilder text = new StringBuilder( wpcontent.Length );
    bool skip = false;
    int resume = -1;
    foreach ( char c in wpcontent )
    {
        int i = (int)c;
        if ( !skip )
        {
            if ( i == 63 || i == 128 || i == 160 || i == 65533 )
            {
                text.Append( ' ' );
            }
            else if ( i >= 169 && i != 172 && i <= 174 )
            {
                text.Append( '-' );
            }
            else if ( i == 10 || i == 13 || i == 208 )
            {
                text.Append( Environment.NewLine );
            }
            else if ( i >= 192 && i <= 236 )
            {
                // start of a sequence that is skipped until the same code is seen again
                skip = true;
                resume = i;
            }
            else if ( i == 15 )
            {
                text.Append( (char)233 ); // e acute
            }
            else if ( i <= 31 || ( i >= 129 && i <= 159 ) || ( i >= 161 && i <= 168 ) || i == 172 || ( i >= 175 && i <= 191 ) || ( i >= 237 && i <= 255 ) )
            {
                // control characters, ignore
            }
            else
            {
                text.Append( c );
            }
        }
        else if ( i == resume )
        {
            // end of the skipped sequence
            skip = false;
            resume = -1;
        }
    }
    plaintext = text.ToString( );
    return !string.IsNullOrWhiteSpace( plaintext );
}
/// <summary>
/// Writes an optional error message followed by the help text to standard error.
/// </summary>
/// <param name="errmsg">
/// Optional: the first element is the error message, which may contain
/// composite format items ({0}, {1}, ...); the remaining elements are the
/// values for those items. A message without values is written literally.
/// </param>
/// <returns>Always 2, the return code for command line errors.</returns>
static int ShowHelp( params string[] errmsg )
{
    #region Help Text
    /*
    Word2Txt, Version 1.05
    Extract plain text from a Word document and send it to the screen
    Usage: Word2Txt "wordfile" [ encoding | /D ]
    or: Word2Txt /E
    Where: wordfile is the path of the Word document to be read
    (no wildcards allowed)
    encoding force use of alternative encoding for plain
    text, e.g. UTF-8 to preserve accented characters
    or IBM437 to convert unicode quotes to ASCII
    /D use the encoding specified in the document file
    (for .DOCX and .ODT only, if Word isn't available)
    /E list all available encodings
    Notes: If a "regular" (MSI based) Microsoft Word (2007 or later)
    installation is detected, this program will use Word to read the
    text from the Word file, which may be ANY file format recognized
    by Word.
    If Word was already active when this program is started, any other
    opened document(s) will be left alone, and only the document opened
    by this program will be closed.
    If Word is not available, or if it encounters unreadable content
    (i.e. the file is corrupted), the text can still be extracted, but
    only from .DOC, .DOCX, .ODT, .RTF and .WPD files.
    If the specified encoding does not match any available encoding name,
    the program will try again, ignoring dashes; if that does not provide
    a match, the program will try matching the specified encoding with
    the available encodings' codepages.
    This program requires .NET 4.5.
    Return code ("errorlevel") 0 means Word encountered no errors and
    some text was extracted from the file; 1 means Word is not available
    or the file was corrupted; 2 means either command line errors or the
    program failed to extract any text.
    Written by Rob van der Woude
    https://www.robvanderwoude.com
    */
    #endregion Help Text

    #region Error Message

    if ( errmsg != null && errmsg.Length > 0 )
    {
        Console.Error.WriteLine( );
        Console.ForegroundColor = ConsoleColor.Red;
        Console.Error.Write( "ERROR:\t" );
        Console.ForegroundColor = ConsoleColor.White;
        if ( errmsg.Length > 1 )
        {
            Console.Error.WriteLine( errmsg[0], errmsg.Skip( 1 ).Cast<object>( ).ToArray( ) );
        }
        else
        {
            // No values to insert: write the message as is, so braces in it cannot break formatting
            Console.Error.WriteLine( errmsg[0] );
        }
        Console.ResetColor( );
    }

    #endregion Error Message

    #region Display Help Text

    Console.Error.WriteLine( );
    Console.Error.WriteLine( "Word2Txt, Version {0}", progver );
    Console.Error.WriteLine( "Extract plain text from a Word document and send it to the screen" );
    Console.Error.WriteLine( );
    Console.Error.Write( "Usage: " );
    WriteHighlighted( "Word2Txt \"wordfile\" [ encoding | /D ]", true );
    Console.Error.WriteLine( );
    Console.Error.Write( "or: " );
    WriteHighlighted( "Word2Txt /E", true );
    Console.Error.WriteLine( );
    Console.Error.Write( "Where: " );
    WriteHighlighted( "wordfile" );
    Console.Error.WriteLine( " is the path of the Word document to be read" );
    Console.Error.WriteLine( " (no wildcards allowed)" );
    WriteHighlighted( " encoding" );
    Console.Error.WriteLine( " force use of alternative encoding for plain" );
    Console.Error.Write( " text, e.g. " );
    WriteHighlighted( "UTF-8" );
    Console.Error.WriteLine( " to preserve accented characters" );
    Console.Error.Write( " or " );
    WriteHighlighted( "IBM437" );
    Console.Error.WriteLine( " to convert unicode quotes to ASCII" );
    WriteHighlighted( " /D" );
    Console.Error.WriteLine( " use the encoding specified in the document file" );
    Console.Error.WriteLine( " (for .DOCX and .ODT only, if Word isn't available)" );
    WriteHighlighted( " /E" );
    Console.Error.WriteLine( " list all available encodings" );
    Console.Error.WriteLine( );
    Console.Error.WriteLine( "Notes: If a \"regular\" (MSI based) Microsoft Word (2007 or later)" );
    Console.Error.WriteLine( " installation is detected, this program will use Word to read the" );
    Console.Error.WriteLine( " text from the Word file, which may be ANY file format recognized" );
    Console.Error.WriteLine( " by Word." );
    Console.Error.WriteLine( " If Word was already active when this program is started, any other" );
    Console.Error.WriteLine( " opened document(s) will be left alone, and only the document opened" );
    Console.Error.WriteLine( " by this program will be closed." );
    Console.Error.WriteLine( " If Word is not available, or if it encounters unreadable content" );
    Console.Error.WriteLine( " (i.e. the file is corrupted), the text can still be extracted, but" );
    Console.Error.WriteLine( " only from .DOC, .DOCX, .ODT, .RTF and .WPD files." );
    Console.Error.WriteLine( " If the specified encoding does not match any available encoding name," );
    Console.Error.WriteLine( " the program will try again, ignoring dashes; if that does not provide" );
    Console.Error.WriteLine( " a match, the program will try matching the specified encoding with" );
    Console.Error.WriteLine( " the available encodings' codepages." );
    Console.Error.WriteLine( " This program requires .NET 4.5." );
    Console.Error.WriteLine( " Return code (\"errorlevel\") 0 means Word encountered no errors and" );
    Console.Error.WriteLine( " some text was extracted from the file; 1 means Word is not available" );
    Console.Error.WriteLine( " or the file was corrupted; 2 means either command line errors or the" );
    Console.Error.WriteLine( " program failed to extract any text." );
    Console.Error.WriteLine( );
    Console.Error.WriteLine( "Written by Rob van der Woude" );
    Console.Error.WriteLine( "https://www.robvanderwoude.com" );

    #endregion Display Help Text

    return 2;
}

/// <summary>
/// Writes text to standard error in white, then resets the console colors.
/// </summary>
/// <param name="text">The text to write.</param>
/// <param name="newline">true to terminate the line while the highlight color is still active.</param>
static void WriteHighlighted( string text, bool newline = false )
{
    Console.ForegroundColor = ConsoleColor.White;
    if ( newline )
    {
        Console.Error.WriteLine( text );
    }
    else
    {
        Console.Error.Write( text );
    }
    Console.ResetColor( );
}
}
}
// page last modified: 2024-04-16; loaded in 0.0130 seconds