Extract Embedded Image Object in RTF
Solution 1
Here is a piece of code that can extract all objects ('Package' class objects) from an RTF stream:
/// <summary>
/// Scans the RTF file at <paramref name="filePath"/> for embedded objects whose
/// <c>objclass</c> is "Package" and writes each packaged payload to a file named
/// after the object's display name, in the current directory.
/// </summary>
/// <param name="filePath">Path of the RTF file to read.</param>
public static void ExtractPackageObjects(string filePath)
{
    using (StreamReader sr = new StreamReader(filePath))
    {
        RtfReader reader = new RtfReader(sr);

        // The enumerator comes from an iterator method, so dispose it deterministically.
        using (IEnumerator<RtfObject> enumerator = reader.Read().GetEnumerator())
        {
            while (enumerator.MoveNext())
            {
                if (enumerator.Current.Text != "object")
                    continue;

                // A false result means the token stream is exhausted.
                if (!RtfReader.MoveToNextControlWord(enumerator, "objclass"))
                    break;

                string className = RtfReader.GetNextText(enumerator);
                if (className != "Package")
                    continue;

                if (!RtfReader.MoveToNextControlWord(enumerator, "objdata"))
                    break;

                // null means no text token followed \objdata before the end of the input.
                byte[] data = RtfReader.GetNextTextAsByteArray(enumerator);
                if (data == null)
                    break;

                using (MemoryStream objectData = new MemoryStream(data))
                using (MemoryStream packageData = new MemoryStream())
                {
                    RtfReader.ExtractObjectData(objectData, packageData);
                    packageData.Position = 0;
                    PackagedObject po = PackagedObject.Extract(packageData);

                    // The display name comes from the document itself; keep only the file
                    // name part so a crafted document cannot write outside the current directory.
                    string fileName = Path.GetFileName(po.DisplayName);
                    File.WriteAllBytes(fileName, po.Data);
                }
            }
        }
    }
}
And here are the utility classes that this code uses. There is a simple stream-based RTF parser that lets you get to the interesting control words.
There is also a utility to extract data from a serialized Object Packager instance. Object Packager is an almost 20-year-old OLE 1.0 technology, and its serialized binary format is not documented (to my knowledge), but it is understandable.
This works fine on your provided sample, but you may have to adapt things around.
/// <summary>
/// Minimal stream-based RTF tokenizer. It reads characters one at a time from a
/// <see cref="TextReader"/> and yields control words and text runs. It also hosts the
/// static helpers used to walk the token stream and to unwrap OLE 1.0 object data.
/// </summary>
public class RtfReader
{
    /// <summary>Creates a tokenizer over <paramref name="reader"/>.</summary>
    /// <param name="reader">Source of RTF characters; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
    public RtfReader(TextReader reader)
    {
        if (reader == null)
            throw new ArgumentNullException("reader");

        Reader = reader;
    }

    /// <summary>The underlying character source. It is not disposed by this class.</summary>
    public TextReader Reader { get; private set; }

    /// <summary>
    /// Lazily tokenizes the input into <see cref="RtfControlWord"/> and <see cref="RtfText"/>
    /// items. Carriage returns and line feeds are skipped; group delimiters are not yielded.
    /// Empty or whitespace-only tokens are never yielded.
    /// </summary>
    /// <returns>The tokens in document order.</returns>
    public IEnumerable<RtfObject> Read()
    {
        StringBuilder controlWord = new StringBuilder();
        StringBuilder text = new StringBuilder();
        Stack<RtfParseState> stack = new Stack<RtfParseState>();
        RtfParseState state = RtfParseState.Group;

        do
        {
            int i = Reader.Read();
            if (i < 0)
            {
                // End of input: flush whatever is still buffered.
                if (!string.IsNullOrWhiteSpace(controlWord.ToString()))
                    yield return new RtfControlWord(controlWord.ToString());

                if (!string.IsNullOrWhiteSpace(text.ToString()))
                    yield return new RtfText(text.ToString());

                yield break;
            }

            char c = (char)i;

            // noise chars
            if ((c == '\r') ||
                (c == '\n'))
                continue;

            switch (state)
            {
                case RtfParseState.Group:
                    if (c == '{')
                    {
                        stack.Push(state);
                        break;
                    }

                    if (c == '\\')
                    {
                        state = RtfParseState.ControlWord;
                        break;
                    }
                    break;

                case RtfParseState.ControlWord:
                    if (c == '\\')
                    {
                        // another controlWord
                        if (!string.IsNullOrWhiteSpace(controlWord.ToString()))
                        {
                            yield return new RtfControlWord(controlWord.ToString());
                            controlWord.Clear();
                        }
                        break;
                    }

                    if (c == '{')
                    {
                        // a new group
                        state = RtfParseState.Group;
                        if (!string.IsNullOrWhiteSpace(controlWord.ToString()))
                        {
                            yield return new RtfControlWord(controlWord.ToString());
                            controlWord.Clear();
                        }
                        break;
                    }

                    if (c == '}')
                    {
                        // close group
                        state = stack.Count > 0 ? stack.Pop() : RtfParseState.Group;
                        if (!string.IsNullOrWhiteSpace(controlWord.ToString()))
                        {
                            yield return new RtfControlWord(controlWord.ToString());
                            controlWord.Clear();
                        }
                        break;
                    }

                    if (!Char.IsLetterOrDigit(c))
                    {
                        // Any other delimiter ends the control word and starts a text run.
                        state = RtfParseState.Text;
                        text.Append(c);
                        if (!string.IsNullOrWhiteSpace(controlWord.ToString()))
                        {
                            yield return new RtfControlWord(controlWord.ToString());
                            controlWord.Clear();
                        }
                        break;
                    }

                    controlWord.Append(c);
                    break;

                case RtfParseState.Text:
                    if (c == '\\')
                    {
                        state = RtfParseState.EscapedText;
                        break;
                    }

                    if (c == '{')
                    {
                        if (!string.IsNullOrWhiteSpace(text.ToString()))
                        {
                            yield return new RtfText(text.ToString());
                            text.Clear();
                        }

                        // a new group
                        state = RtfParseState.Group;
                        break;
                    }

                    if (c == '}')
                    {
                        if (!string.IsNullOrWhiteSpace(text.ToString()))
                        {
                            yield return new RtfText(text.ToString());
                            text.Clear();
                        }

                        // close group
                        state = stack.Count > 0 ? stack.Pop() : RtfParseState.Group;
                        break;
                    }

                    text.Append(c);
                    break;

                case RtfParseState.EscapedText:
                    if ((c == '\\') || (c == '}') || (c == '{'))
                    {
                        state = RtfParseState.Text;
                        text.Append(c);
                        break;
                    }

                    // ansi character escape: \'hh
                    if (c == '\'')
                    {
                        int hi = Reader.Read();
                        int lo = Reader.Read();

                        // A truncated escape at end of input is dropped; the next loop
                        // iteration sees end of input and flushes the buffers.
                        if (hi >= 0 && lo >= 0)
                            text.Append(FromHexa((char)hi, (char)lo));

                        // The escape is complete, so go back to plain text. Without this the
                        // character after the escape would be misread as a control word.
                        state = RtfParseState.Text;
                        break;
                    }

                    if (!string.IsNullOrWhiteSpace(text.ToString()))
                    {
                        yield return new RtfText(text.ToString());
                        text.Clear();
                    }

                    // in fact, it's a normal controlWord
                    controlWord.Append(c);
                    state = RtfParseState.ControlWord;
                    break;
            }
        }
        while (true);
    }

    /// <summary>
    /// Advances <paramref name="enumerator"/> until the current item's text equals
    /// <paramref name="word"/>. The item's type is not checked.
    /// </summary>
    /// <returns>true if a match was found; false if the enumerator was exhausted.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="enumerator"/> is null.</exception>
    public static bool MoveToNextControlWord(IEnumerator<RtfObject> enumerator, string word)
    {
        if (enumerator == null)
            throw new ArgumentNullException("enumerator");

        while (enumerator.MoveNext())
        {
            if (enumerator.Current.Text == word)
                return true;
        }
        return false;
    }

    /// <summary>
    /// Advances <paramref name="enumerator"/> to the next <see cref="RtfText"/> item and
    /// returns its text.
    /// </summary>
    /// <returns>The text, or null if no text item remains.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="enumerator"/> is null.</exception>
    public static string GetNextText(IEnumerator<RtfObject> enumerator)
    {
        if (enumerator == null)
            throw new ArgumentNullException("enumerator");

        while (enumerator.MoveNext())
        {
            RtfText text = enumerator.Current as RtfText;
            if (text != null)
                return text.Text;
        }
        return null;
    }

    /// <summary>
    /// Advances <paramref name="enumerator"/> to the next <see cref="RtfText"/> item and
    /// decodes its text as pairs of hexadecimal digits. Whitespace between digits is ignored.
    /// </summary>
    /// <returns>The decoded bytes, or null if no text item remains.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="enumerator"/> is null.</exception>
    /// <exception cref="FormatException">
    /// The text contains a non-hexadecimal character or an odd number of digits.
    /// </exception>
    public static byte[] GetNextTextAsByteArray(IEnumerator<RtfObject> enumerator)
    {
        if (enumerator == null)
            throw new ArgumentNullException("enumerator");

        while (enumerator.MoveNext())
        {
            RtfText text = enumerator.Current as RtfText;
            if (text != null)
            {
                string hex = text.Text;
                List<byte> bytes = new List<byte>(hex.Length / 2);
                bool haveHigh = false;
                char high = '\0';

                foreach (char c in hex)
                {
                    // Hex groups may be separated by spaces; skipping them keeps the digit
                    // pairs aligned.
                    if (char.IsWhiteSpace(c))
                        continue;

                    if (!haveHigh)
                    {
                        high = c;
                        haveHigh = true;
                        continue;
                    }

                    bytes.Add((byte)FromHexa(high, c));
                    haveHigh = false;
                }

                if (haveHigh)
                    throw new FormatException("Hexadecimal data contains an odd number of digits.");

                return bytes.ToArray();
            }
        }
        return null;
    }

    // Extracts an EmbeddedObject/ObjectHeader from a stream
    // see [MS-OLEDS]: Object Linking and Embedding (OLE) Data Structures for more information
    // chapter 2.2: OLE1.0 Format Structures
    /// <summary>
    /// Reads an OLE 1.0 object header from <paramref name="inputStream"/> and copies the
    /// native data that follows it to <paramref name="outputStream"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException">Either stream is null.</exception>
    /// <exception cref="NotSupportedException">The header's FormatID is not 2.</exception>
    public static void ExtractObjectData(Stream inputStream, Stream outputStream)
    {
        if (inputStream == null)
            throw new ArgumentNullException("inputStream");

        if (outputStream == null)
            throw new ArgumentNullException("outputStream");

        // Not disposed on purpose: disposing the BinaryReader would close the caller's stream.
        BinaryReader reader = new BinaryReader(inputStream);
        reader.ReadInt32(); // OLEVersion
        int formatId = reader.ReadInt32(); // FormatID
        if (formatId != 2) // see 2.2.4 Object Header. 2 means EmbeddedObject
            throw new NotSupportedException("Only embedded objects (FormatID 2) are supported.");

        ReadLengthPrefixedAnsiString(reader); // className
        ReadLengthPrefixedAnsiString(reader); // topicName
        ReadLengthPrefixedAnsiString(reader); // itemName

        int nativeDataSize = reader.ReadInt32();
        byte[] bytes = reader.ReadBytes(nativeDataSize);
        outputStream.Write(bytes, 0, bytes.Length);
    }

    // see chapter 2.1.4 LengthPrefixedAnsiString
    // Reads a 32-bit length followed by that many bytes; the final byte is dropped from the
    // returned string (a terminator, per the structure referenced above).
    private static string ReadLengthPrefixedAnsiString(BinaryReader reader)
    {
        int length = reader.ReadInt32();
        if (length == 0)
            return string.Empty;

        byte[] bytes = reader.ReadBytes(length);

        // ReadBytes returns fewer bytes on truncated input; fail clearly instead of letting
        // GetString throw an unrelated ArgumentOutOfRangeException.
        if (bytes.Length < length)
            throw new EndOfStreamException();

        return Encoding.Default.GetString(bytes, 0, length - 1);
    }

    // States of the tokenizer in Read().
    private enum RtfParseState
    {
        ControlWord,
        Text,
        EscapedText,
        Group
    }

    // Converts two hexadecimal digit characters into the character with that code.
    private static char FromHexa(char hi, char lo)
    {
        return (char)byte.Parse(hi.ToString() + lo, NumberStyles.HexNumber, CultureInfo.InvariantCulture);
    }
}
/// <summary>
/// Utility class to parse an OLE1.0 OLEOBJECT produced by Object Packager.
/// Instances are created only through <see cref="Extract"/>.
/// </summary>
public class PackagedObject
{
    private PackagedObject()
    {
    }

    /// <summary>First zero-terminated string in the package.</summary>
    public string DisplayName { get; private set; }

    /// <summary>Second zero-terminated string in the package.</summary>
    public string IconFilePath { get; private set; }

    /// <summary>16-bit value that follows the icon path.</summary>
    public int IconIndex { get; private set; }

    /// <summary>Zero-terminated string that precedes the payload.</summary>
    public string FilePath { get; private set; }

    /// <summary>The packaged payload bytes.</summary>
    public byte[] Data { get; private set; }

    // Reads bytes up to (and consuming) the next zero byte, mapping each byte to one char.
    private static string ReadAnsiString(BinaryReader reader)
    {
        StringBuilder chars = new StringBuilder();
        for (byte current = reader.ReadByte(); current != 0; current = reader.ReadByte())
        {
            chars.Append((char)current);
        }
        return chars.ToString();
    }

    /// <summary>
    /// Parses a serialized package from <paramref name="inputStream"/>, starting at the
    /// stream's current position.
    /// </summary>
    /// <param name="inputStream">Stream positioned at the start of the package data.</param>
    /// <returns>The parsed package.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="inputStream"/> is null.</exception>
    /// <exception cref="NotSupportedException">The package type is not 3 (file).</exception>
    public static PackagedObject Extract(Stream inputStream)
    {
        if (inputStream == null)
        {
            throw new ArgumentNullException("inputStream");
        }

        BinaryReader reader = new BinaryReader(inputStream);

        reader.ReadUInt16(); // signature, ignored
        string displayName = ReadAnsiString(reader);
        string iconFilePath = ReadAnsiString(reader);
        int iconIndex = reader.ReadUInt16();

        int packageType = reader.ReadUInt16();
        if (packageType != 3) // 3 is file, 1 is link
        {
            throw new NotSupportedException();
        }

        reader.ReadInt32(); // size of the next section, ignored
        string filePath = ReadAnsiString(reader);
        int payloadSize = reader.ReadInt32();
        byte[] payload = reader.ReadBytes(payloadSize);
        // note after that, there may be unicode + long path info

        PackagedObject result = new PackagedObject();
        result.DisplayName = displayName;
        result.IconFilePath = iconFilePath;
        result.IconIndex = iconIndex;
        result.FilePath = filePath;
        result.Data = payload;
        return result;
    }
}
/// <summary>
/// Base type for items produced by the RTF tokenizer; holds the item's trimmed text.
/// </summary>
public class RtfObject
{
    /// <summary>Stores <paramref name="text"/> with leading and trailing whitespace removed.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public RtfObject(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException("text");
        }

        string trimmed = text.Trim();
        Text = trimmed;
    }

    /// <summary>The trimmed text of this item.</summary>
    public string Text { get; private set; }
}
/// <summary>A run of text between control words; the text is stored trimmed by the base class.</summary>
public class RtfText : RtfObject
{
    /// <summary>Creates a text item from <paramref name="text"/>.</summary>
    public RtfText(string text)
        : base(text)
    {
    }
}
/// <summary>An RTF control word; its name is exposed through the inherited Text property.</summary>
public class RtfControlWord : RtfObject
{
    /// <summary>Creates a control word item named <paramref name="name"/>.</summary>
    public RtfControlWord(string name)
        : base(name)
    {
    }
}
Solution 2
OK, this should work for you. To demonstrate my solution, I created a WinForms project with a PictureBox whose paint event handler was mapped to the following function:
/// <summary>
/// Paint handler: loads "MySampleFile.rtf", decodes its first \pict group into an image
/// and draws it into a 100x100 rectangle at the origin.
/// </summary>
private void rtfImage_Paint(object sender, PaintEventArgs e)
{
    string rtfStr = System.IO.File.ReadAllText("MySampleFile.rtf");
    string imageDataHex = ExtractImgHex(rtfStr);
    byte[] imageBuffer = ToBinary(imageDataHex);

    // Image.FromStream requires the stream to stay open for the lifetime of the image, so
    // the drawing happens inside both using blocks. Disposing the image also releases its
    // GDI+ handle, which would otherwise leak on every repaint.
    using (MemoryStream stream = new MemoryStream(imageBuffer))
    using (Image image = Image.FromStream(stream))
    {
        Rectangle rect = new Rectangle(0, 0, 100, 100);
        e.Graphics.DrawImage(image, rect);
    }
}
This code relies on the System.Drawing.Image.FromStream() method, along with two "helper" functions:
A string extractor:
/// <summary>
/// Returns the hexadecimal payload of the first <c>{\pict\...</c> group in
/// <paramref name="s"/>: everything between the first space after the tag and the next
/// closing brace.
/// </summary>
/// <param name="s">RTF document text.</param>
/// <returns>The hex text of the picture data, possibly containing whitespace.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="s"/> is null.</exception>
/// <exception cref="System.ArgumentOutOfRangeException">
/// No picture group, no space after its control words, or no closing brace was found.
/// The exception type matches what the unchecked string calls threw before.
/// </exception>
string ExtractImgHex(string s)
{
    // I'm sure you could use regex here, but this works.
    // This assumes one picture per file; loops required otherwise
    if (s == null)
        throw new System.ArgumentNullException("s");

    // Ordinal search: RTF markup is not linguistic text.
    int pictTagIdx = s.IndexOf("{\\pict\\", System.StringComparison.Ordinal);
    if (pictTagIdx < 0)
        throw new System.ArgumentOutOfRangeException("s", "No {\\pict group was found in the RTF text.");

    // Without this check a missing space made the search restart at index 0 and return
    // the document header instead of picture data.
    int spaceIdx = s.IndexOf(' ', pictTagIdx);
    if (spaceIdx < 0)
        throw new System.ArgumentOutOfRangeException("s", "The \\pict group has no space before its data.");

    int startIndex = spaceIdx + 1;
    int endIndex = s.IndexOf('}', startIndex);
    if (endIndex < 0)
        throw new System.ArgumentOutOfRangeException("s", "The \\pict group is not closed.");

    return s.Substring(startIndex, endIndex - startIndex);
}
... and a binary converter:
/// <summary>
/// Decodes a string of hexadecimal digit pairs into bytes. Whitespace anywhere in the
/// input is ignored; a dangling final digit without a partner is dropped.
/// </summary>
/// <param name="imageDataHex">Hexadecimal text, optionally containing whitespace.</param>
/// <returns>One byte per pair of digits, with no padding.</returns>
/// <exception cref="ArgumentNullException"><paramref name="imageDataHex"/> is null.</exception>
/// <exception cref="FormatException">A pair contains a non-hexadecimal character.</exception>
public static byte[] ToBinary(string imageDataHex)
{
    //this function taken entirely from:
    // http://www.codeproject.com/Articles/27431/Writing-Your-Own-RTF-Converter
    if (imageDataHex == null)
    {
        throw new ArgumentNullException("imageDataHex");
    }

    // Count only the real digits. Sizing the array from the raw length (whitespace
    // included) left spurious zero bytes at the end of the result.
    int hexDigits = 0;
    for (int i = 0; i < imageDataHex.Length; i++)
    {
        if (!char.IsWhiteSpace(imageDataHex[i]))
        {
            hexDigits++;
        }
    }

    byte[] imageDataBinary = new byte[hexDigits / 2];
    StringBuilder hex = new StringBuilder(2);
    int dataPos = 0;
    for (int i = 0; i < imageDataHex.Length; i++)
    {
        char c = imageDataHex[i];
        if (char.IsWhiteSpace(c))
        {
            continue;
        }

        hex.Append(c);
        if (hex.Length == 2)
        {
            imageDataBinary[dataPos] = byte.Parse(
                hex.ToString(),
                System.Globalization.NumberStyles.HexNumber,
                System.Globalization.CultureInfo.InvariantCulture);
            dataPos++;
            hex.Length = 0;
        }
    }
    return imageDataBinary;
}
Solution 3
The code below can extract all types of embedded objects (images, documents, mails, etc.) with their original file names and save them to a local path.
// Loads an RTF document and saves every embedded OLE object next to it, using the file
// name and extension that the object itself suggests.
string MyDir = @"E:\temp\";
Document doc = new Document(MyDir + "Requirement#4.rtf");

// NOTE(review): the second argument is presumably a "search all descendants" flag;
// confirm against the Aspose.Words documentation.
NodeCollection nodeColl = doc.GetChildNodes(NodeType.Shape, true);

foreach (Shape shape in nodeColl)
{
    var ole = shape.OleFormat;

    // Shapes without OLE data are skipped.
    if (ole == null)
    {
        continue;
    }

    string targetPath = MyDir + ole.SuggestedFileName + ole.SuggestedExtension;
    ole.Save(targetPath);
}
PRNDL Development Studios
Tired of looking on stackoverflow for the answers? We can write the code for you! Contact us: http://prndl.us or [email protected] !
Updated on June 04, 2022Comments
-
PRNDL Development Studios almost 2 years
I have `rtf` documents that include an embedded object (an image). I need to extract this as an `Image` object (or any other usable format). I have checked out this CodeProject article but the default apps don't render it correctly (they render the 'default image' image, not the image itself), so I moved on. Here is a sample of the RTF code (I had to shorten it because of size):
{\rtf1\ansi\deff0{\fonttbl{\f0\fnil\fcharset0 MS Sans Serif;}} \viewkind4\uc1\pard\lang1033\f0\fs18{\object\objemb{\*\objclass Package}\objw855\objh810{\*\objdata 01050000 02000000 08000000 5061636b61676500 00000000 00000000 1f900000 02007369675f5f2e6a706700433a5c55736572735c726563657074696f6e5c4465736b746f705c 5369676e6174757265735c7369675f5f2e6a7067000000030034000000433a5c55736572735c52 45434550547e315c417070446174615c4c6f63616c5c54656d705c7369675f5f20283132292e6a 706700c18e0000ffd8ffe000104a46494600010101004800470000ffdb00430001010101010101 010101010101010101010101010101010101010101010101010101010101010101010101010101 010101010101010101010101010101010101ffdb00430101010101010101010101010101010101 010101010101010101010101010101010101010101010101010101010101010101010101010101 010101010101010101ffc0001108012c03e803012200021101031101ffc4001f00010002030002 0301000000000000000000090a07080b050602030401ffc4003f10000006030001040201030301 04070900000203040506010708090a11121314152116172223314118192532591a24576598d6d8 2933384651788497b7ffc4001a010101000301010000000000000000000000030204050106ffc4 002b11010003010100020103030402030000000002030401051112130614211522230731415124 32536162ffda000c03010002110311003f00bfc000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000000000000000 ... 
005c0072006500630065007000740069006f006e005c004400650073006b0074006f0070005c00 5300690067006e006100740075007200650073005c007300690067005f005f002e006a00700067 00 01050000 00000000 }{\result{\pict\wmetafile8\picw2010\pich1905\picwgoal855\pichgoal810 0100090000033b0700000200210600000000050000000b0200000000050000000c02350038001c 000000fb02f4ff000000000000900100000001000000005365676f65205549000e0a52104c2308 00dd1900d894ef758001f3758d0e664a040000002d010000050000000902000000000500000001 02ffffff00a5000000410bc600880020002000000000002000200000000c002800000020000000 400000000100010000000000000100000000000000000000000000000000000000000000ffffff ... 0021001c001c000000fb021000070000000000bc02000000000102022253797374656d00008d0e 664a00000a0022008a0100000000ffffffff8cdd1900040000002d010100030000000000 }}}\par }
-
PRNDL Development Studios over 11 yearsThis is very close. The
pict
data is the icon (seen in image1) that the jpeg is embedded. Double clicking that icon in an RTF reader opens the actual Image I want, which I assume is embedded in theobject
data. This is what this code gets me: i.imgur.com/TXy0Gv4.png -
kmote over 11 yearsplease clarify how the .rtf file was created. How exactly was the image originally embedded into the file? You said in an earlier comment that you just dragged & dropped a .jpeg into the rtf. But when I do that, only the
pict
tag is created. I would like to create a test file with the same tags as yours (unless you can make the full text of your sample file available somewhere). -
PRNDL Development Studios over 11 yearsIn MS Word try Insert->Object->From File. That seems to do what I have. (I am using a Line of Business application to create them). An example of my file is here: 2shared.com/document/QAf6JQqO/test.html
-
kmote over 11 yearsWell, unfortunately that is not nearly as easy. The JPEG data embedded in the
object
has been encoded (most likely with[OleSaveToStream](http://msdn.microsoft.com/en-us/library/windows/desktop/ms678407(v=vs.85).aspx)
, and extracting it is going to be painful. (In C#, that is. I understand that VB has a RichTextBox control that you could load your rtf into and then access all the OLEObjects in file. Unfortunately, the RichTextBox in C# doesn't provide that access.) I'm afraid I'm at a dead end. Sorry. -
PRNDL Development Studios over 11 yearsAlright, I will have to look into the
VB
thing. Thanks for the help. -
kmote over 11 yearsWell, sorry again, but my VB comment is out of date too, unfortunately. You're not going to find OLE support in .NET at all. OLE is an archaic technology; you'd need to interface with the Win32 API (using C++ or something).
-
PRNDL Development Studios over 11 yearsYou, sir, are a wizard. Thank you.
-
Qwerty over 10 yearsBy the way, could you tell please where have you read information about OLE structure for public static PackagedObject Extract(Stream inputStream) function? I can't find anything about fields sig, DisplayName, IconFilePath in docs. @SimonMourier
-
Breeze over 7 yearsdid you ever stumble upon (and solve) the problem that some file types like .docx or .xlsx are not fully supported (they can be opened but not extracted with the code above)?
-
Simon Mourier over 7 years@Breeze - docx and xlsx are not rtf-based documents. What do you mean?
-
Breeze over 7 yearsI mean docx or xlsx files that are embedded as objects inside the rtf
-
Sivabalakrishnan over 5 yearsI have the same issue now ! please help me now. Link : stackoverflow.com/questions/52927298/… @SimonMourier
-
BoltBait about 5 years@kmote I have an rtf file with embedded graphic 120x260. After running your code above, after
image = Image.FromStream(stream);
the image is 1920x1080 (the size of my screen). Is there a way to recreate the image the original size? -
BoltBait about 5 yearsI was able to solve the size problem by reading the \picw and \pich values and resizing the image to that size. But, I'm still wondering if there's a better way to do it.