Skip to content

Commit

Permalink
some improvements in both: HeroldSpider + GelbeSeitenSpider
Browse files Browse the repository at this point in the history
  • Loading branch information
r-Larch committed Sep 26, 2019
1 parent bfb6296 commit e179e78
Show file tree
Hide file tree
Showing 8 changed files with 72 additions and 37 deletions.
2 changes: 1 addition & 1 deletion HeroldAt.sln
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.29215.179
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HeroldAt", "HeroldAt.csproj", "{04B247A5-42E4-42F3-AE30-AE129A91AB6C}"
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SpiderBot", "SpiderBot.csproj", "{04B247A5-42E4-42F3-AE30-AE129A91AB6C}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Expand Down
File renamed without changes.
3 changes: 2 additions & 1 deletion SpiderWindow.xaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@
<TextBlock DockPanel.Dock="Top" Text="{Binding Name}" FontSize="30" />
<DockPanel DockPanel.Dock="Top" Margin="5">
<TextBlock DockPanel.Dock="Top" Text="{Binding Category}" />
<TextBlock DockPanel.Dock="Top" Text="{Binding Address}" />
<TextBlock DockPanel.Dock="Top" Text="{Binding Address.StreatLine}" />
<TextBlock DockPanel.Dock="Top" Text="{Binding Address.ZipLine}" />
<TextBlock DockPanel.Dock="Top">
<TextBlock Text="{Binding Tel}"></TextBlock> / <TextBlock Text="{Binding Email}"></TextBlock> / <TextBlock Text="{Binding Website}"></TextBlock>
</TextBlock>
Expand Down
5 changes: 1 addition & 4 deletions src/Batch.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
using System;
using System.Collections.Generic;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;


namespace LarchSys.Bot {
Expand Down
48 changes: 25 additions & 23 deletions src/GelbeSeitenSpider.cs
Original file line number Diff line number Diff line change
Expand Up @@ -105,16 +105,18 @@ private static int GetPageCount(IDocument listPage)

private static IEnumerable<SearchResult> GetListItems(IDocument listPage)
{
var items = listPage.QuerySelectorAll("#gs_treffer .m08_teilnehmer");
var items = listPage.QuerySelectorAll("#gs_treffer .mod-Treffer");
foreach (var x in items) {
yield return new SearchResult {
Url = x.QuerySelector("[itemprop=\"url\"]")?.GetAttribute("href"),
Name = x.QuerySelector("[itemprop=\"name\"]")?.TextContent?.Trim(),
Url = x.QuerySelector("a")?.GetAttribute("href"),
Name = x.QuerySelector("[data-wipe-name]")?.TextContent?.Trim(),
Category = x.QuerySelector(".branchen_box span:nth-child(1)")?.TextContent,
Address = Regex.Replace(x.QuerySelector("address")?.TextContent ?? string.Empty, @"\s+", " ").Trim(),
//Tel = item.QuerySelector(".nummer")?.TextContent,
Tel = x.QuerySelector("[itemprop=\"telephone\"]")?.TextContent?.Trim(),
Img = x.QuerySelector("[data-lazy-src]")?.GetAttribute("data-lazy-src")
Address = Address.Parse(Regex.Replace(x.QuerySelector("[data-wipe-name=\"Adresse\"]")?.TextContent ?? string.Empty, @"\s+", " ").Trim()),
Tel = x.QuerySelector("[data-wipe-name=\"Kontaktdaten\"]")?.TextContent?.Trim(),
Img = x.QuerySelector("[data-lazy-src]")?.GetAttribute("data-lazy-src"),

Website = x.QuerySelector(".icon-homepage")?.ParentElement?.GetAttribute("href")?.Trim(),
Email = Regex.Replace(x.QuerySelector(".icon-email")?.ParentElement?.GetAttribute("href")?.Trim() ?? string.Empty, @"mailto:([^?]*)\??.*", "$1")
};
}
}
Expand All @@ -124,7 +126,7 @@ protected void AddResults(IEnumerable<SearchResult> searchResults)
{
var results = searchResults.ToArray();

QueueDeepScan(results);
// QueueDeepScan(results);

// more efficient than Array.Concat
var r = new SearchResult[Results.Count + results.Length];
Expand All @@ -134,26 +136,26 @@ protected void AddResults(IEnumerable<SearchResult> searchResults)
Results = new ObservableCollection<SearchResult>(r);
ResultsCount = Results.Count;

void QueueDeepScan(params SearchResult[] x)
{
WorkerTask = WorkerTask.ContinueWith(_ => Task.WaitAll(x.Select(ScanDetailPage).ToArray()));
}
//void QueueDeepScan(params SearchResult[] x)
//{
// WorkerTask = WorkerTask.ContinueWith(_ => Task.WaitAll(x.Select(ScanDetailPage).ToArray()));
//}
}


private async Task ScanDetailPage(SearchResult result)
{
if (string.IsNullOrEmpty(result.Url)) {
return;
}
//private async Task ScanDetailPage(SearchResult result)
//{
// if (string.IsNullOrEmpty(result.Url)) {
// return;
// }

var doc = await Browser.OpenAsync(result.Url);
// var doc = await Browser.OpenAsync(result.Url);

result.Email = doc.QuerySelector("[property=\"email\"]")?.GetAttribute("content");
result.Website = doc.QuerySelector("[property=\"url\"]")?.GetAttribute("href");
// result.Email = doc.QuerySelector("[property=\"email\"]")?.GetAttribute("content");
// result.Website = doc.QuerySelector("[property=\"url\"]")?.GetAttribute("href");

DeepScanCount++;
ProgressDeepScan = (int) (((double) DeepScanCount / ResultsCount) * 100d);
}
// DeepScanCount++;
// ProgressDeepScan = (int) (((double) DeepScanCount / ResultsCount) * 100d);
//}
}
}
2 changes: 1 addition & 1 deletion src/HeroldSpider.cs
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ private static IEnumerable<SearchResult> GetListItems(IDocument listPage)
Url = item.QuerySelector("meta[itemprop=\"url\"]").GetAttribute("content"),
Name = item.QuerySelector("[itemprop=\"name\"]").TextContent,
Category = item.QuerySelector(".result-item-category").TextContent,
Address = item.QuerySelector(".address").TextContent,
Address = Address.Parse(item.QuerySelector(".address").TextContent),
Tel = Regex.Replace(tel, @"^(\+\d{2})?(\d{3})(\d{4})(\d*)", "$1 $2 $3 $4").Trim(),
Img = item.QuerySelector("[itemprop=\"image\"]")?.GetAttribute("src")
};
Expand Down
34 changes: 32 additions & 2 deletions src/SearchResult.cs
Original file line number Diff line number Diff line change
@@ -1,12 +1,42 @@
namespace LarchSys.Bot {
using System.Text.RegularExpressions;


namespace LarchSys.Bot {
public class SearchResult {
public string Name { get; set; }
public string Address { get; set; }
public Address Address { get; set; }
public string Tel { get; set; }
public string Url { get; set; }
public string Category { get; set; }
public string Img { get; set; }
public string Email { get; set; }
public string Website { get; set; }
}

/// <summary>
/// A postal address split into a street part and a zip part.
/// </summary>
public class Address {
    // Matches from the first run of four or more digits up to the end of that line.
    // Created once by the static initializer instead of being lazily assigned on first use.
    private static readonly Regex ZipLineRegex = new Regex(@"(\d{4}\d*.*)$", RegexOptions.Multiline | RegexOptions.Compiled);

    /// <summary>Text in front of the zip part; <c>null</c> when the input was empty.</summary>
    public string StreatLine { get; set; }

    /// <summary>Text starting at the first run of four or more digits; <c>null</c> when none was found.</summary>
    public string ZipLine { get; set; }

    /// <summary>
    /// Splits <paramref name="address"/> into <see cref="StreatLine"/> and <see cref="ZipLine"/>.
    /// </summary>
    /// <param name="address">Raw address text; may be <c>null</c>, empty or whitespace only.</param>
    /// <returns>
    /// Never <c>null</c>. Both properties are <c>null</c> for null/blank input;
    /// only <see cref="StreatLine"/> is set when no run of four or more digits is found.
    /// </returns>
    public static Address Parse(string address)
    {
        if (string.IsNullOrWhiteSpace(address)) {
            return new Address();
        }

        var match = ZipLineRegex.Match(address);
        if (!match.Success) {
            return new Address {
                StreatLine = address.Trim(),
            };
        }

        var zipGroup = match.Groups[1];

        // Drop the separator that preceded the zip part (", " or a line break),
        // otherwise it ends up as trailing junk in the UI and in the export.
        var street = address.Substring(0, zipGroup.Index).Trim().TrimEnd(',').TrimEnd();

        return new Address {
            StreatLine = street,
            // '.' also matches '\r', so a CRLF input would leave a trailing '\r' here.
            ZipLine = zipGroup.Value.Trim()
        };
    }
}
}
15 changes: 10 additions & 5 deletions src/Spider.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using System;
using System.Collections.Generic;
using System.Collections.ObjectModel;
using System.ComponentModel;
using System.IO;
Expand Down Expand Up @@ -88,30 +87,36 @@ public virtual async Task Export()
FileName = ExportFileName,
};


if (dialog.ShowDialog(Window) == System.Windows.Forms.DialogResult.OK) {
var file = new FileInfo(dialog.FileName);
using var fs = file.OpenWrite();
using var sw = new StreamWriter(fs, Encoding.UTF8);

await sw.WriteLineAsync(Row(
"Kategorie", "Name", "Adresse", "Tel", "E-Mail", "Website", "Url"
"Kategorie", "Name", "Adresse Straße", "Adresse PLZ", "Tel", "E-Mail", "Website", "Url"
));

foreach (var _ in Results) {
await sw.WriteLineAsync(Row(
_.Category,
_.Name,
_.Address,
_.Address.StreatLine,
_.Address.ZipLine,
_.Tel,
_.Email,
_.Website,
_.Url
));
}

Status = $"{file.FullName} saved successful";
MessageBox.Show(Status, "saved", MessageBoxButton.OK);
sw.Close();

Status = $"{file?.FullName} saved successful";
}


MessageBox.Show(Status, "saved", MessageBoxButton.OK);
}
catch (Exception e) {
MessageBox.Show(e.ToString(), e.Message, MessageBoxButton.OK, MessageBoxImage.Error);
Expand Down

0 comments on commit e179e78

Please sign in to comment.