Files
PdfMarker_Web/PdfMarker/Services/PdfQuotaExtractor.cs
2026-01-23 09:57:52 +01:00

238 lines
6.5 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using System.Globalization;
using PdfMarker.Models;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;
namespace PdfMarker.Services;
/// <summary>
/// Scans the words of a PDF document and returns the ones that look like
/// drawing dimensions ("quote"): diameters, radii, angles, chamfers and plain
/// linear values.
/// </summary>
/// <remarks>
/// Positions and sizes on the returned candidates are divided by the page
/// width/height. <c>X</c> is the left edge of the word; <c>Y</c> is the bottom
/// edge of the word measured from the top of the page (0 = top, 1 = bottom).
/// </remarks>
public class PdfQuotaExtractor
{
    // Fraction of the page height, measured from the bottom edge, that is
    // skipped because it is expected to hold the title block ("cartiglio").
    private const double CartiglioCutoffY = 0.15;

    // Fraction of the page width that is skipped along the left and right edges.
    private const double CartiglioMarginX = 0.05;

    // Two candidates of the same type are merged when both their normalised
    // X and Y differ by less than this amount.
    private const double MergeDistance = 0.02;

    // Multiplier applied to the confidence for each penalty that matches.
    private const double ConfidencePenalty = 0.7;

    /// <summary>
    /// Opens the PDF at <paramref name="filePath"/> and extracts the dimension
    /// candidates found on every page.
    /// </summary>
    /// <param name="filePath">Path of the PDF file, passed to <c>PdfDocument.Open</c>.</param>
    /// <returns>
    /// The candidates of all pages, in page order. Fragments of the same type
    /// that lie very close to each other on the same page are merged into a
    /// single candidate.
    /// </returns>
    public List<QuotaCandidate> Extract(string filePath)
    {
        var result = new List<QuotaCandidate>();

        using var document = PdfDocument.Open(filePath);

        foreach (var page in document.GetPages())
        {
            var pageWidth = page.Width;
            var pageHeight = page.Height;

            // A degenerate page would turn every normalised value into NaN/Infinity.
            if (pageWidth <= 0 || pageHeight <= 0)
            {
                continue;
            }

            var pageCandidates = new List<QuotaCandidate>();

            foreach (var word in page.GetWords())
            {
                if (!IsQuotaLikeText(word.Text))
                {
                    continue;
                }

                var type = ClassifyQuota(word.Text);
                if (type == QuotaType.Unknown)
                {
                    continue;
                }

                var box = word.BoundingBox;
                var x = box.Left / pageWidth;
                // PDF coordinates grow upwards; flip so that y grows downwards.
                var y = 1 - (box.Bottom / pageHeight);
                var w = box.Width / pageWidth;
                var h = box.Height / pageHeight;

                // --- TITLE BLOCK FILTERS ---

                // Bottom strip of the sheet. Because y is measured from the top,
                // the bottom strip is the range above 1 - cutoff.
                if (y > 1 - CartiglioCutoffY)
                {
                    continue;
                }

                // Left and right borders.
                if (x < CartiglioMarginX || x > 1 - CartiglioMarginX)
                {
                    continue;
                }

                // NOTE: a minimum-height filter (h < 0.003), meant to drop the tiny
                // texts typical of a title block, is currently disabled.

                // Single-character texts are discarded unless they contain Ø, R or 0.
                var t = word.Text.Trim();
                if (t.Length <= 1 && !t.Contains("Ø") && !t.Contains("R") && !t.Contains("0"))
                {
                    continue;
                }

                pageCandidates.Add(new QuotaCandidate
                {
                    RawText = word.Text,
                    Type = type,
                    X = x,
                    Y = y,
                    Width = w,
                    Height = h,
                    IsHorizontal = box.Width >= box.Height,
                    IsVertical = box.Height > box.Width,
                    Confidence = ComputeConfidence(type, word)
                });
            }

            // Merge page by page: normalised coordinates are only comparable
            // within one page, so fragments of different pages must never be fused.
            result.AddRange(MergeSplitTexts(pageCandidates));
        }

        return result;
    }

    /// <summary>
    /// Tells whether <paramref name="text"/> can be a dimension: it must contain
    /// at least one digit and be classified as something other than
    /// <c>QuotaType.Unknown</c> by <see cref="ClassifyQuota"/>.
    /// </summary>
    private static bool IsQuotaLikeText(string text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return false;
        }

        // A dimension must contain at least one digit.
        if (!text.Any(char.IsDigit))
        {
            return false;
        }

        // Reuse the classifier so the two methods cannot drift apart.
        return ClassifyQuota(text) != QuotaType.Unknown;
    }

    /// <summary>
    /// Classifies a text, ignoring spaces. The first matching rule wins:
    /// contains "Ø" → diameter; starts with "R" → radius; contains "°" → angle;
    /// contains "x" or "×" → chamfer; parses as a number → linear; otherwise unknown.
    /// </summary>
    private static QuotaType ClassifyQuota(string text)
    {
        text = text.Replace(" ", "");

        if (text.Contains("Ø"))
        {
            return QuotaType.Diameter;
        }

        // Ordinal comparison: the prefix is a symbol, not culture-dependent prose.
        if (text.StartsWith("R", StringComparison.Ordinal))
        {
            return QuotaType.Radius;
        }

        if (text.Contains("°"))
        {
            return QuotaType.Angle;
        }

        if (text.Contains("x") || text.Contains("×"))
        {
            return QuotaType.Chamfer;
        }

        return IsNumeric(text) ? QuotaType.Linear : QuotaType.Unknown;
    }

    /// <summary>
    /// Tells whether <paramref name="text"/> parses as a number once every comma
    /// has been replaced by a dot (so both "37,50" and "12.7" are accepted).
    /// </summary>
    /// <remarks>
    /// Parsing uses <c>NumberStyles.Any</c> with the invariant culture, which is
    /// permissive: signed values, parenthesised values and exponent notation
    /// also count as numeric.
    /// </remarks>
    private static bool IsNumeric(string text)
    {
        return double.TryParse(
            text.Replace(",", "."),
            NumberStyles.Any,
            CultureInfo.InvariantCulture,
            out _);
    }

    /// <summary>
    /// Computes a heuristic confidence for a candidate: a base value that depends
    /// on <paramref name="type"/>, multiplied by a penalty for each rule below
    /// that applies.
    /// </summary>
    /// <param name="type">Classification of the word.</param>
    /// <param name="word">The PDF word; its text and bounding box are inspected.</param>
    /// <returns>The resulting confidence value.</returns>
    private static double ComputeConfidence(QuotaType type, Word word)
    {
        var confidence = type switch
        {
            QuotaType.Diameter => 0.9,
            QuotaType.Radius => 0.85,
            QuotaType.Angle => 0.85,
            QuotaType.Chamfer => 0.8,
            QuotaType.Linear => 0.6,
            _ => 0.3
        };

        var box = word.BoundingBox;
        var text = word.Text.Replace(" ", "");

        // Penalise vertical texts (taller than wide).
        if (box.Height > box.Width)
        {
            confidence *= ConfidencePenalty;
        }

        // Penalise numbers written without a decimal separator (1, 2, 30, ...),
        // whatever their length.
        if (IsNumeric(text) && !IsDecimalNumber(text))
        {
            confidence *= ConfidencePenalty;
        }

        return confidence;
    }

    /// <summary>
    /// Tells whether <paramref name="text"/> is numeric (see <see cref="IsNumeric"/>)
    /// and written with a decimal separator, e.g. "37,50", "12.7" or "1,5".
    /// </summary>
    private static bool IsDecimalNumber(string text)
    {
        return IsNumeric(text) && (text.Contains(",") || text.Contains("."));
    }

    /// <summary>
    /// Merges candidates that are probably fragments of one dimension text:
    /// same type and both normalised X and Y closer than
    /// <see cref="MergeDistance"/> to the first candidate of the group.
    /// </summary>
    /// <param name="input">Candidates that share the same coordinate space (one page).</param>
    /// <returns>
    /// A new list in which every group is replaced by a single candidate, placed
    /// where the first member of the group was. Candidates without neighbours
    /// are returned unchanged.
    /// </returns>
    private static List<QuotaCandidate> MergeSplitTexts(List<QuotaCandidate> input)
    {
        var merged = new List<QuotaCandidate>(input.Count);

        // Track consumption by index so the result does not depend on how
        // QuotaCandidate implements equality.
        var used = new bool[input.Count];

        for (var i = 0; i < input.Count; i++)
        {
            if (used[i])
            {
                continue;
            }

            var seed = input[i];
            used[i] = true;
            var cluster = new List<QuotaCandidate> { seed };

            // Every index below i is already consumed, so start right after the seed.
            for (var j = i + 1; j < input.Count; j++)
            {
                if (used[j])
                {
                    continue;
                }

                var other = input[j];
                if (other.Type != seed.Type)
                {
                    continue;
                }

                // Distance is always measured against the seed, not against
                // the members added later.
                var dx = Math.Abs(seed.X - other.X);
                var dy = Math.Abs(seed.Y - other.Y);
                if (dx < MergeDistance && dy < MergeDistance)
                {
                    cluster.Add(other);
                    used[j] = true;
                }
            }

            if (cluster.Count == 1)
            {
                merged.Add(seed);
                continue;
            }

            // Join the fragments, shortest text first (stable for equal lengths).
            var mergedText = string.Join(
                " ",
                cluster
                    .Select(q => q.RawText.Trim())
                    .OrderBy(fragment => fragment.Length));

            merged.Add(new QuotaCandidate
            {
                RawText = mergedText,
                Type = seed.Type,
                X = cluster.Average(q => q.X),
                Y = cluster.Average(q => q.Y),
                Width = cluster.Max(q => q.Width),
                Height = cluster.Max(q => q.Height),
                IsHorizontal = cluster.Any(q => q.IsHorizontal),
                IsVertical = cluster.Any(q => q.IsVertical),
                Confidence = cluster.Average(q => q.Confidence)
            });
        }

        return merged;
    }
}