Creating a New Harvester

This guide will walk you through creating a new harvester for the unified property harvester system. Harvesters are responsible for scraping property data from real estate websites and submitting it to the Mill API.

Before you begin, ensure you have:

  • Go 1.24.9+ installed
  • Understanding of the common.Harvester interface (a sketch follows this list)
  • Familiarity with web scraping (we use Colly)
  • Access to the target website’s structure
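
For reference, the interface looks roughly like this. This is a sketch inferred from the methods used throughout this guide; check harvesters/common for the authoritative definition:

type Harvester interface {
	GetName() string
	GetSource() string
	SetRateLimit(delay time.Duration)
	GetStats() HarvesterStats
	HealthCheck() error
	ScrapeProperties(opts HarvesterOptions) ([]Property, error)
}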

Create a new directory for your harvester in harvesters/sources/:

cd harvesters/sources
mkdir my-new-harvester
cd my-new-harvester

Naming Convention:

  • Use lowercase with hyphens (e.g., realestate-com-au)
  • Match the domain name when possible
  • Keep names descriptive and unique

Create harvester.go in your new directory. All harvesters must implement the common.Harvester interface:

package my_new_harvester

import (
	"fmt"
	"log"
	"strings"
	"sync"
	"time"

	"github.com/gocolly/colly/v2"

	"harvesters/common"
)

// Harvester implements the common.Harvester interface
type Harvester struct {
	collector *colly.Collector
	source    string
	rateLimit time.Duration
	stats     common.HarvesterStats
	mu        sync.RWMutex
}
// NewHarvester creates a new harvester instance
func NewHarvester() common.Harvester {
	c := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"),
	)

	// Set up rate limiting
	c.Limit(&colly.LimitRule{
		DomainGlob:  "*example.com*",
		Parallelism: 1,
		Delay:       2 * time.Second,
	})

	// Add headers to appear browser-like
	c.OnRequest(func(r *colly.Request) {
		r.Headers.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
		r.Headers.Set("Accept-Language", "en-US,en;q=0.5")
	})

	return &Harvester{
		collector: c,
		source:    "example.com",
		rateLimit: 2 * time.Second,
		stats:     common.HarvesterStats{},
	}
}
// GetName returns the harvester name (used in CLI)
func (h *Harvester) GetName() string {
	return "my-new-harvester"
}

// GetSource returns the source website domain
func (h *Harvester) GetSource() string {
	return h.source
}

// SetRateLimit configures rate limiting
func (h *Harvester) SetRateLimit(delay time.Duration) {
	h.mu.Lock()
	defer h.mu.Unlock()
	h.rateLimit = delay
	h.collector.Limit(&colly.LimitRule{
		DomainGlob:  "*example.com*",
		Parallelism: 1,
		Delay:       delay,
	})
}

// GetStats returns current harvesting statistics
func (h *Harvester) GetStats() common.HarvesterStats {
	h.mu.RLock()
	defer h.mu.RUnlock()
	return h.stats
}

// HealthCheck verifies the harvester can access the target website
func (h *Harvester) HealthCheck() error {
	testURL := "https://www.example.com"
	if err := h.collector.Visit(testURL); err != nil {
		return fmt.Errorf("failed to access %s: %w", testURL, err)
	}
	return nil
}
// ScrapeProperties implements the main scraping logic
func (h *Harvester) ScrapeProperties(opts common.HarvesterOptions) ([]common.Property, error) {
	var properties []common.Property
	var mu sync.Mutex

	// Reset stats
	h.mu.Lock()
	h.stats = common.HarvesterStats{}
	h.mu.Unlock()

	// Set up collector callbacks
	h.collector.OnHTML(".property-listing", func(e *colly.HTMLElement) {
		property := h.parseProperty(e, opts)
		if property != nil {
			mu.Lock()
			properties = append(properties, *property)
			mu.Unlock()
			h.mu.Lock()
			h.stats.PropertiesScraped++
			h.mu.Unlock()
		}
	})

	// Handle errors
	h.collector.OnError(func(r *colly.Response, err error) {
		h.mu.Lock()
		h.stats.ErrorsEncountered++
		h.mu.Unlock()
		if opts.Verbose {
			log.Printf("Error scraping %s: %v", r.Request.URL, err)
		}
	})

	// Build search URL based on location
	searchURL := h.buildSearchURL(opts.Location, opts.Pages)

	// Visit search pages
	for page := 1; page <= opts.Pages && len(properties) < opts.MaxProperties; page++ {
		url := fmt.Sprintf("%s&page=%d", searchURL, page)
		if opts.Verbose {
			log.Printf("Scraping page %d: %s", page, url)
		}
		h.collector.Visit(url)
		time.Sleep(h.rateLimit) // Respect rate limits
	}

	return properties[:min(len(properties), opts.MaxProperties)], nil
}
// parseProperty extracts property data from an HTML element. It takes the
// harvest options so the configured location can be recorded as the region.
func (h *Harvester) parseProperty(e *colly.HTMLElement, opts common.HarvesterOptions) *common.Property {
	// Extract property details from HTML
	address := strings.TrimSpace(e.ChildText(".address"))
	priceText := strings.TrimSpace(e.ChildText(".price"))

	// Parse price (remove currency symbols, commas)
	price := h.parsePrice(priceText)

	// Extract other fields
	bedrooms := h.parseInt(e.ChildText(".bedrooms"))
	bathrooms := h.parseFloat(e.ChildText(".bathrooms"))

	// Get property URL
	propertyURL := e.Request.AbsoluteURL(e.ChildAttr("a", "href"))

	// Collect image URLs
	var imageURLs []string
	e.ForEach(".property-image img", func(_ int, img *colly.HTMLElement) {
		imgURL := e.Request.AbsoluteURL(img.Attr("src"))
		imageURLs = append(imageURLs, imgURL)
	})

	return &common.Property{
		SourceID:       h.extractSourceID(propertyURL),
		Source:         h.source,
		Address:        address,
		Price:          price,
		PriceCurrency:  h.detectCurrency(priceText),
		Bedrooms:       bedrooms,
		Bathrooms:      bathrooms,
		PropertyType:   strings.TrimSpace(e.ChildText(".property-type")),
		Description:    strings.TrimSpace(e.ChildText(".description")),
		ImageURLs:      imageURLs,
		SourceURL:      propertyURL,
		CollectionTime: time.Now(),
		Region:         opts.Location,
	}
}
// Helper functions

func (h *Harvester) buildSearchURL(location string, pages int) string {
	// Build the search URL based on your target website's structure
	return fmt.Sprintf("https://www.example.com/search?location=%s", location)
}

func (h *Harvester) parsePrice(priceText string) int {
	// Remove currency symbols, commas, spaces
	// Convert to integer (cents or base unit)
	// Return 0 if parsing fails
	return 0 // Implement your parsing logic
}

func (h *Harvester) parseInt(text string) int {
	// Extract integer from text
	return 0 // Implement your parsing logic
}

func (h *Harvester) parseFloat(text string) float64 {
	// Extract float from text
	return 0.0 // Implement your parsing logic
}

func (h *Harvester) extractSourceID(url string) string {
	// Extract unique ID from URL
	return "" // Implement your extraction logic
}

func (h *Harvester) detectCurrency(priceText string) string {
	// Detect currency from price text
	return "USD" // Implement your detection logic
}
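
The helper stubs above are intentionally left for you to fill in against your target site's markup. As one possible starting point, here is a minimal parsePrice sketch (it additionally needs the regexp and strconv imports); adjust it for decimal separators and whatever base unit your site reports:

var nonDigits = regexp.MustCompile(`[^0-9]`)

// parsePrice strips currency symbols, commas, and spaces, then parses
// whatever digits remain. It returns 0 when no digits are present.
func (h *Harvester) parsePrice(priceText string) int {
	digits := nonDigits.ReplaceAllString(priceText, "")
	if digits == "" {
		return 0
	}
	price, err := strconv.Atoi(digits)
	if err != nil {
		return 0
	}
	return price
}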

Create harvester_test.go to test your harvester:

package my_new_harvester

import (
	"testing"
	"time"

	"harvesters/common"
)

func TestNewHarvester(t *testing.T) {
	h := NewHarvester()
	if h == nil {
		t.Fatal("NewHarvester returned nil")
	}
	if h.GetName() != "my-new-harvester" {
		t.Errorf("Expected name 'my-new-harvester', got '%s'", h.GetName())
	}
	if h.GetSource() != "example.com" {
		t.Errorf("Expected source 'example.com', got '%s'", h.GetSource())
	}
}

func TestHealthCheck(t *testing.T) {
	h := NewHarvester()
	if err := h.HealthCheck(); err != nil {
		t.Logf("Health check failed (may be expected): %v", err)
	}
}

func TestScrapeProperties(t *testing.T) {
	h := NewHarvester()
	opts := common.HarvesterOptions{
		Location:      "test-location",
		Pages:         1,
		MaxProperties: 5,
		Delay:         2 * time.Second,
		Verbose:       true,
		DryRun:        true,
	}
	properties, err := h.ScrapeProperties(opts)
	if err != nil {
		t.Fatalf("ScrapeProperties failed: %v", err)
	}
	if len(properties) == 0 {
		t.Log("No properties found (may be expected)")
	}
}

Create compliance_test.go to ensure your harvester implements the interface correctly:

package my_new_harvester

import (
	"testing"

	"harvesters/common"
)

// TestHarvesterInterface ensures the harvester implements all required methods
func TestHarvesterInterface(t *testing.T) {
	var _ common.Harvester = (*Harvester)(nil)
}

Add your harvester to harvesters/main.go:

import (
	// ... existing imports
	my_new_harvester "harvesters/sources/my-new-harvester"
)

func main() {
	registry := common.NewHarvesterRegistry()

	// ... existing registrations
	registry.Register(my_new_harvester.NewHarvester())

	// ... rest of main function
}

Also add a default location in the main() function’s location switch statement:

case "my-new-harvester":
*location = "default-location"

Build and test your harvester:

# From harvesters directory
cd harvesters
# Build the binary
go build -o bin/harvester .
# Test your harvester
./bin/harvester -harvester my-new-harvester -location test -pages 1 -max 5 -dry-run -verbose
# Run health check
./bin/harvester -harvester my-new-harvester -health-check
# Run tests
go test ./sources/my-new-harvester/...

Always implement respectful rate limiting:

// Default: 2 seconds between requests
c.Limit(&colly.LimitRule{
	DomainGlob:  "*example.com*",
	Parallelism: 1,
	Delay:       2 * time.Second,
})
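
Colly's LimitRule also supports a RandomDelay field, which adds random jitter on top of the fixed delay; randomized gaps between requests tend to look less mechanical than a fixed interval:

c.Limit(&colly.LimitRule{
	DomainGlob:  "*example.com*",
	Parallelism: 1,
	Delay:       2 * time.Second,
	RandomDelay: 1 * time.Second, // adds up to 1s of extra jitter per request
})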

Handle errors gracefully:

h.collector.OnError(func(r *colly.Response, err error) {
	h.mu.Lock()
	h.stats.ErrorsEncountered++
	h.mu.Unlock()
	if opts.Verbose {
		log.Printf("Error scraping %s: %v", r.Request.URL, err)
	}
})

Validate extracted data:

func (h *Harvester) parseProperty(e *colly.HTMLElement, opts common.HarvesterOptions) *common.Property {
	address := strings.TrimSpace(e.ChildText(".address"))
	if address == "" {
		return nil // Skip invalid properties
	}
	price := h.parsePrice(e.ChildText(".price"))
	if price <= 0 {
		return nil // Skip properties without valid price
	}
	// ... rest of parsing
}

Use realistic browser headers:

c := colly.NewCollector(
	colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"),
)
c.OnRequest(func(r *colly.Request) {
	r.Headers.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	r.Headers.Set("Accept-Language", "en-US,en;q=0.5")
	r.Headers.Set("Referer", "https://www.example.com")
})

Ensure all required fields are populated (a validation sketch follows the list):

  • SourceID: Unique identifier from the source
  • Source: Website domain
  • Address: Full property address
  • Price: Price as integer (in smallest currency unit)
  • PriceCurrency: Currency code (USD, EUR, etc.)
  • SourceURL: Direct link to property listing
  • CollectionTime: Timestamp when data was collected
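
A small helper can enforce these requirements before a property is appended. This is an illustrative sketch, not part of the common package; adapt the checks to your needs:

// validateProperty reports whether a scraped property carries every
// required field. Hypothetical helper, not part of harvesters/common.
func validateProperty(p common.Property) bool {
	if p.SourceID == "" || p.Source == "" || p.Address == "" {
		return false
	}
	if p.Price <= 0 || p.PriceCurrency == "" {
		return false
	}
	if p.SourceURL == "" || p.CollectionTime.IsZero() {
		return false
	}
	return true
}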

Your harvester integrates with the Mill API automatically when run with API credentials:

./bin/harvester \
  -harvester my-new-harvester \
  -location test \
  -mill-api "http://localhost:4000/api/v1" \
  -mill-api-key "your-token"

The harvester will:

  1. Scrape properties
  2. Submit them to Mill API in batches
  3. Handle authentication automatically via token service

Handle paginated results:

for page := 1; page <= opts.Pages && len(properties) < opts.MaxProperties; page++ {
	url := fmt.Sprintf("%s?page=%d", baseURL, page)
	h.collector.Visit(url)
	time.Sleep(h.rateLimit)
}

For JavaScript-rendered content, consider:

  • Using browser automation (Playwright, Selenium)
  • Finding API endpoints the site uses (see the sketch after this list)
  • Using headless browsers if necessary
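
If the site loads listings from a JSON endpoint (visible in the browser's network tab), calling it directly is often simpler and more stable than scraping rendered HTML. The endpoint, payload shape, and field names below are hypothetical, and the snippet needs the net/http and encoding/json imports:

// fetchListingsJSON sketches calling a hypothetical JSON endpoint directly.
func fetchListingsJSON() ([]common.Property, error) {
	resp, err := http.Get("https://www.example.com/api/listings?location=test&page=1")
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// Decode only the fields needed; everything else is ignored.
	var payload struct {
		Listings []struct {
			ID      string `json:"id"`
			Address string `json:"address"`
			Price   int    `json:"price"`
		} `json:"listings"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return nil, err
	}

	properties := make([]common.Property, 0, len(payload.Listings))
	for _, l := range payload.Listings {
		properties = append(properties, common.Property{
			SourceID: l.ID,
			Address:  l.Address,
			Price:    l.Price,
		})
	}
	return properties, nil
}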

If you encounter anti-bot measures:

  • Increase delays between requests
  • Rotate user agents (see the sketch after this list)
  • Use proxy rotation (if allowed)
  • Respect robots.txt
  • Consider official APIs if available
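
For user-agent rotation specifically, Colly ships an extensions package that assigns a random browser user agent to each request:

import "github.com/gocolly/colly/v2/extensions"

// Picks a random user agent for every outgoing request.
extensions.RandomUserAgent(c)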

Issue: No properties found

  • Check CSS selectors match the website structure (see the sketch after this list)
  • Verify the search URL is correct
  • Enable verbose mode: -verbose
  • Check if the site requires authentication
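
A quick way to check selectors is to dump each fetched page to disk during a dry run and inspect it offline against your selectors. A minimal sketch, assuming the os package is imported:

// Write each fetched page to debug.html so selectors can be verified offline.
h.collector.OnResponse(func(r *colly.Response) {
	if err := os.WriteFile("debug.html", r.Body, 0o644); err != nil {
		log.Printf("failed to write debug.html: %v", err)
	}
})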

Issue: Rate limiting / 429 errors

  • Increase delay: -delay 5s
  • Reduce parallelism
  • Check if site has API limits

Issue: Parsing errors

  • Verify the HTML structure hasn’t changed
  • Add defensive parsing with fallbacks
  • Log raw HTML for debugging (see the sketch below)
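
For example, when a required field fails to parse, you can log the element's raw HTML (e.DOM is a goquery selection). A sketch of what that might look like inside parseProperty:

address := strings.TrimSpace(e.ChildText(".address"))
if address == "" {
	// Dump the raw fragment so the broken selector can be diagnosed.
	if html, err := e.DOM.Html(); err == nil {
		log.Printf("no address found, raw listing HTML: %s", html)
	}
	return nil
}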

Issue: Health check fails

  • Verify website is accessible
  • Check network connectivity
  • Ensure URL is correct

After creating your harvester:

  1. Test thoroughly with various locations and parameters
  2. Run health checks regularly
  3. Monitor error rates in production (GetStats exposes the counters; see the sketch after this list)
  4. Update selectors if website structure changes
  5. Document any special requirements or limitations
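
For monitoring, the GetStats method already exposes the relevant counters; a minimal sketch of logging them after a run:

// Log run-level counters so error spikes are visible in production logs.
stats := h.GetStats()
log.Printf("harvester=%s scraped=%d errors=%d", h.GetName(), stats.PropertiesScraped, stats.ErrorsEncountered)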

See existing harvesters for reference:

  • harvesters/sources/harcourts/ - Simple HTML scraping
  • harvesters/sources/homes-co-nz/ - Complex pagination
  • harvesters/sources/zillow/ - API-based approach