Extrait les coordonnées de chaque mot séparé en un TextChunk dans un fichier pdf

Après avoir suivi cette solution, j'essaie d'obtenir tous les mots contenus dans un TextChunk ainsi que les coordonnées de chacun d'eux (page actuelle, haut, bas, gauche, droite). Extrait les coordonnées de chaque mot séparé en un TextChunk dans un fichier pdf

Puisqu'un TextChunk peut être une phrase, un mot ou autre chose, j'ai essayé de le faire manuellement, en partant du rectangle du dernier mot et en le découpant à chaque fois. J'ai remarqué que cette méthode manuelle risquait d'être très boguée (je devrais gérer manuellement les caractères spéciaux et ainsi de suite), donc je me suis demandé si iTextSharp offrait un moyen plus simple de faire cela.

Ma classe Chunk et ma classe héritée de LocationTextExtractionStrategy sont les suivantes :

/// <summary>
/// A piece of text extracted from a PDF page, together with its bounding
/// rectangle, the render info it was built from, and its font data.
/// </summary>
public class Chunk
{
    /// <summary>Identifier generated with Guid.NewGuid() when the chunk is created.</summary>
    public Guid Id { get; set; }

    /// <summary>Rectangle assigned to this chunk by the caller.</summary>
    public Rectangle Rect { get; set; }

    /// <summary>Render info this chunk was created from.</summary>
    public TextRenderInfo Render { get; set; }

    /// <summary>Font taken from <see cref="Render"/>.</summary>
    public BaseFont BF { get; set; }

    /// <summary>Text of the chunk.</summary>
    public string Text { get; set; }

    /// <summary>Font size estimated from the rendered width of a space (see ObtainFontSize).</summary>
    public int FontSize { get; set; }


    /// <summary>
    /// Creates a chunk whose text is the full text of <paramref name="renderInfo"/>.
    /// </summary>
    /// <param name="rect">Rectangle to associate with the chunk.</param>
    /// <param name="renderInfo">Render info providing the text and the font.</param>
    public Chunk(Rectangle rect, TextRenderInfo renderInfo)
        : this(rect, renderInfo, renderInfo.GetText())
    {
    }


    /// <summary>
    /// Creates a chunk with an explicit text, e.g. a single word cut out of
    /// the text of <paramref name="renderInfo"/>.
    /// </summary>
    /// <param name="rect">Rectangle to associate with the chunk.</param>
    /// <param name="renderInfo">Render info providing the font.</param>
    /// <param name="text">Text to store in the chunk.</param>
    public Chunk(Rectangle rect, TextRenderInfo renderInfo, string text)
    {
        this.Rect = rect;
        this.Render = renderInfo;
        this.Text = text;
        Initialize();
    }


    // Fills in the values derived from Render: a fresh id, the font and the estimated size.
    private void Initialize()
    {
        this.Id = Guid.NewGuid();
        this.BF = Render.GetFont();
        this.FontSize = ObtainFontSize();
    }

    /// <summary>
    /// Estimates the font size by scaling a 12-unit reference size with the ratio
    /// between the rendered single-space width and the font's width of a space
    /// at that reference size.
    /// </summary>
    /// <returns>
    /// The estimated size converted to an integer, or 0 when the font reports no
    /// positive width for the space character and the ratio cannot be computed.
    /// </returns>
    private int ObtainFontSize()
    {
        const float referenceSize = 12f;
        float referenceSpaceWidth = this.BF.GetWidthPoint(" ", referenceSize);

        // A zero width would make the quotient Infinity/NaN, and Convert.ToInt32
        // throws OverflowException for those values.
        if (!(referenceSpaceWidth > 0f))
        {
            return 0;
        }

        return Convert.ToInt32(this.Render.GetSingleSpaceWidth() * referenceSize / referenceSpaceWidth);
    }
}

/// <summary>
/// Location-based text extraction strategy that, in addition to the base
/// behaviour, records every non-blank piece of rendered text together with
/// a rectangle built from its descent and ascent lines.
/// </summary>
public class LocationTextExtractionPersonalizada : LocationTextExtractionStrategy
{
    // Chunks recorded so far, in the order RenderText received them
    public List<Chunk> ChunksInPage = new List<Chunk>();

    /// <summary>
    /// Forwards the render info to the base strategy and, when its text is not
    /// blank, stores it in <see cref="ChunksInPage"/> with its rectangle.
    /// </summary>
    /// <param name="renderInfo">Rendered text to record; null is ignored.</param>
    public override void RenderText(TextRenderInfo renderInfo)
    {
        // The null guard must come first: both the base call and GetText()
        // dereference renderInfo.
        if (renderInfo == null)
        {
            return;
        }

        base.RenderText(renderInfo);

        if (string.IsNullOrWhiteSpace(renderInfo.GetText()))
        {
            return;
        }

        // Corner points: start of the descent line and end of the ascent line
        var bottomLeft = renderInfo.GetDescentLine().GetStartPoint();
        var topRight = renderInfo.GetAscentLine().GetEndPoint();

        // Rectangle spanned by those two points
        var rect = new Rectangle(
            bottomLeft[Vector.I1],
            bottomLeft[Vector.I2],
            topRight[Vector.I1],
            topRight[Vector.I2]);

        // Keep the chunk together with its coordinates
        ChunksInPage.Add(new Chunk(rect, renderInfo));
    }
}

Donc, une fois que je reçois le fichier et ainsi de suite, je procède de cette façon:

// Walks every page of pdfReader (pages are numbered from 1), extracts the
// page text through a fresh LocationTextExtractionPersonalizada so that the
// strategy collects the page's chunks, then splits those chunks into words.
private void ProcessContent()
{
    int pageNumber = 1;
    while (pageNumber <= pdfReader.NumberOfPages)
    {
        var extractor = new LocationTextExtractionPersonalizada();

        // The extraction call is what feeds the chunks into the extractor
        var pageText = PdfTextExtractor.GetTextFromPage(pdfReader, pageNumber, extractor);

        // Each word of the page with its coordinates
        var pageWords = ChunkRawToWord(extractor.ChunksInPage);

        pageNumber++;
    }
}

/// <summary>
/// Splits every chunk into word-sized chunks and gives each word its own
/// rectangle inside the rectangle of the chunk it came from.
/// </summary>
/// <param name="chunks">Chunks collected by the extraction strategy.</param>
/// <returns>
/// The words of all chunks in order; an empty list when <paramref name="chunks"/>
/// is empty; null when <paramref name="chunks"/> or its first element is null.
/// </returns>
private List<Chunk> ChunkRawToWord(IList<Chunk> chunks)
{
    if (chunks == null)
    {
        return null;
    }

    var words = new List<Chunk>();
    if (chunks.Count == 0)
    {
        // Nothing to split; indexing chunks[0] here would throw.
        return words;
    }

    if (chunks[0] == null)
    {
        return null;
    }

    // Rough pattern: a run of word characters followed by optional punctuation/whitespace
    string pattern = @"[@&\w+]*(-*\/*\s*\:*\;*\,*\.*\(*\)*\%*\>*\<*)?";

    foreach (var chunk in chunks)
    {
        if (chunk == null || string.IsNullOrEmpty(chunk.Text))
        {
            continue;
        }

        var wordsInChunk = Regex.Matches(
            chunk.Text,
            pattern,
            RegexOptions.IgnoreCase);

        foreach (Match match in wordsInChunk)
        {
            if (string.IsNullOrWhiteSpace(match.Value))
            {
                continue;
            }

            // Each word gets its own copy of the chunk rectangle; sharing one
            // instance would make every word end up with the last word's edges.
            var word = new Chunk(
                new Rectangle(chunk.Rect),
                chunk.Render,
                match.Value);

            // Horizontal position is approximated from font metrics: the word
            // starts after the width of the text preceding it in the chunk.
            // NOTE(review): this ignores character/word spacing and scaling.
            float offset = word.BF.GetWidthPoint(
                chunk.Text.Substring(0, match.Index),
                word.FontSize);
            float left = chunk.Rect.Left + offset;

            word.Rect.Left = left;
            word.Rect.Right = left + word.BF.GetWidthPoint(word.Text, word.FontSize);
            words.Add(word);
        }
    }

    return words;
}

Ensuite, j'ai écrit un commentaire sur la solution de mkl, auquel on m'a répondu « use getCharacterRenderInfos() » ; je l'utilise désormais et j'obtiens chaque caractère dans une liste de TextRenderInfo. Je suis désolé, mais je commence à mélanger les concepts et les différentes façons d'appliquer cette solution, et je m'y perds complètement.

J'apprécierais vraiment un coup de main ici. Merci d'avance.

Source

2017-10-04 Gonzo345

Vous pouvez utiliser la méthode TextRenderInfo.GetCharacterRenderInfos() pour obtenir une collection de TextRenderInfo pour chaque caractère de votre bloc. Ensuite, vous pouvez regrouper les caractères individuels en mots et calculer le rectangle contenant le mot en utilisant les coordonnées du premier et du dernier TextRenderInfo dans ce mot.

Dans votre stratégie d'extraction de texte personnalisée :

// Single-character strings that close the current word. The separator itself
// stays attached to the word it closes. ('var' is not allowed on a field, so
// the type is spelled out.)
private static readonly HashSet<string> _separators =
    new HashSet<string> { "-", "(", ")", "/", " ", ":", ";", ",", "." };

/// <summary>
/// Splits the text of <paramref name="currentInfo"/> into words, character by
/// character, and hands each word to ProcessWord.
/// </summary>
/// <param name="currentInfo">Render info of the chunk being split.</param>
protected virtual void ParseRenderInfo(TextRenderInfo currentInfo)
{
    var pendingChars = new List<TextRenderInfo>();

    foreach (var charRenderInfo in currentInfo.GetCharacterRenderInfos())
    {
        pendingChars.Add(charRenderInfo);

        if (_separators.Contains(charRenderInfo.GetText()))
        {
            // A separator ends the word collected so far
            ProcessWord(currentInfo, pendingChars);
            pendingChars.Clear();
        }
    }

    // Flush the characters that follow the last separator (may be none)
    ProcessWord(currentInfo, pendingChars);
}
// Turns the character render infos of one word into a CustomTextChunk and
// appends it to _chunks. The word's location runs from the descent-line start
// of its first character to the ascent-line end of its last character, and
// uses the single-space width of the originating chunk. Does nothing when the
// first or last entry is missing.
private void ProcessWord(TextRenderInfo charChunk, List<TextRenderInfo> wordChunks)
{
    var head = wordChunks.FirstOrDefault();
    var tail = wordChunks.LastOrDefault();

    if (head != null && tail != null)
    {
        var wordStart = head.GetDescentLine().GetStartPoint();
        var wordEnd = tail.GetAscentLine().GetEndPoint();

        // Concatenate the text of every character of the word
        var text = string.Concat(wordChunks.Select(info => info.GetText()));

        var location = new LocationTextExtractionStrategy.TextChunkLocationDefaultImp(
            wordStart,
            wordEnd,
            charChunk.GetSingleSpaceWidth());

        _chunks.Add(new CustomTextChunk(text, location));
    }
}

Source

2017-10-16 14:05:42

Extrait les coordonnées de chaque mot séparé en un TextChunk dans un fichier pdf

Répondre

Questions connexes