40 lines
1010 B
C#
40 lines
1010 B
C#
using PdfMarker.Models;
|
|
using UglyToad.PdfPig;
|
|
|
|
namespace PdfMarker.Services;
|
|
|
|
/// <summary>
/// Scans the words of a PDF and reports those that contain one of a fixed set of
/// marker substrings, together with a page-relative position for each match.
/// </summary>
public class PdfTextExtractor
{
    /// <summary>
    /// Opens the PDF at <paramref name="filePath"/>, walks every word on every page and
    /// returns one <see cref="TextFeature"/> per word accepted by
    /// <see cref="IsTechnicalFeature"/>.
    /// </summary>
    /// <param name="filePath">Path of the PDF document to read.</param>
    /// <returns>
    /// The matching words in page order, then in the order the page yields them.
    /// The list is empty when nothing matches; it is never <c>null</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="filePath"/> is <c>null</c>.
    /// </exception>
    public List<TextFeature> Extract(string filePath)
    {
        // Fail fast with a clear parameter name instead of an error from deep inside the PDF library.
        if (filePath is null)
        {
            throw new ArgumentNullException(nameof(filePath));
        }

        using var doc = PdfDocument.Open(filePath);

        var result = new List<TextFeature>();

        foreach (var page in doc.GetPages())
        {
            foreach (var word in page.GetWords())
            {
                if (!IsTechnicalFeature(word.Text))
                {
                    continue;
                }

                result.Add(new TextFeature
                {
                    // Left edge of the word as a fraction of the page width.
                    X = word.BoundingBox.Left / page.Width,

                    // Bottom edge as a fraction of the page height, flipped with "1 - ...".
                    // NOTE(review): presumably converts a bottom-left PDF origin to a
                    // top-left one for display; confirm whether Bottom or Top is wanted.
                    Y = 1 - (word.BoundingBox.Bottom / page.Height),

                    Text = word.Text
                });
            }
        }

        return result;
    }

    /// <summary>
    /// Returns <c>true</c> when <paramref name="text"/> contains any of the marker
    /// substrings "Ø", "R", "x45" or "H7" (case-sensitive, ordinal comparison).
    /// </summary>
    /// <param name="text">The word text to test.</param>
    /// <remarks>
    /// NOTE(review): the markers look like diameter, radius, chamfer and fit-tolerance
    /// callouts; confirm with the author. The bare "R" test accepts every word that
    /// contains a capital R, which is likely broader than intended.
    /// </remarks>
    private static bool IsTechnicalFeature(string text)
    {
        return text.Contains("Ø", StringComparison.Ordinal)
            || text.Contains("R", StringComparison.Ordinal)
            || text.Contains("x45", StringComparison.Ordinal)
            || text.Contains("H7", StringComparison.Ordinal);
    }
}